├── docs
├── .nojekyll
├── 404.html
├── demo.html
└── index.html
├── gpu-hot.png
├── version.py
├── static
├── favicon.svg
└── js
│ ├── app.js
│ ├── ui.js
│ ├── chart-config.js
│ ├── socket-handlers.js
│ └── chart-manager.js
├── requirements.txt
├── core
├── metrics
│ ├── __init__.py
│ ├── utils.py
│ └── collector.py
├── __init__.py
├── config.py
├── hub_handlers.py
├── handlers.py
├── hub.py
├── nvidia_smi_fallback.py
└── monitor.py
├── .dockerignore
├── tests
├── Dockerfile.test
├── docker-compose.test.yml
├── README.md
└── test_cluster.py
├── docker-compose.yml
├── .editorconfig
├── Dockerfile
├── LICENSE
├── .gitignore
├── .github
└── workflows
│ └── publish.yml
├── README.md
├── app.py
└── templates
└── index.html
/docs/.nojekyll:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gpu-hot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/psalias2006/gpu-hot/HEAD/gpu-hot.png
--------------------------------------------------------------------------------
/version.py:
--------------------------------------------------------------------------------
"""Version information for GPU Hot"""

# Single source of truth for the application version (semver string).
# NOTE(review): core/__init__.py declares __version__ = '1.0.0' — presumably
# stale; confirm which value is authoritative before releasing.
__version__ = "1.6.0"
4 |
5 |
--------------------------------------------------------------------------------
/static/favicon.svg:
--------------------------------------------------------------------------------
1 |
4 |
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi==0.104.1
2 | uvicorn[standard]==0.24.0
3 | websockets==12.0
4 | psutil==5.9.6
5 | nvidia-ml-py==13.580.82
6 | requests==2.31.0
7 | websocket-client==1.6.3
8 | aiohttp==3.9.1
--------------------------------------------------------------------------------
/core/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | GPU Metrics Collection
3 | Organized collection of GPU metrics from NVML
4 | """
5 |
6 | from .collector import MetricsCollector
7 |
8 | __all__ = ['MetricsCollector']
9 |
10 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | *.pyc
3 | *.pyo
4 | *.pyd
5 | .Python
6 | *.so
7 | *.egg
8 | *.egg-info/
9 | dist/
10 | build/
11 | .git/
12 | .gitignore
13 | *.md
14 | !README.md
15 | docs/
16 | *.png
17 | LICENSE
18 | .DS_Store
19 |
--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------
"""
GPU Hot - Core Package
Real-time NVIDIA GPU monitoring application
"""

# NOTE(review): /version.py declares __version__ = "1.6.0"; this package-level
# value looks stale — confirm which one is authoritative.
__version__ = '1.0.0'
__author__ = 'GPU Hot Team'

# Re-export the monitor class and configuration module as the public API.
from .monitor import GPUMonitor
from . import config

__all__ = ['GPUMonitor', 'config']
13 |
14 |
--------------------------------------------------------------------------------
/tests/Dockerfile.test:
--------------------------------------------------------------------------------
1 | FROM python:3.9-slim
2 |
3 | WORKDIR /app
4 |
5 | # Install dependencies
6 | RUN pip install --no-cache-dir \
7 | fastapi==0.104.1 \
8 | uvicorn[standard]==0.24.0 \
9 | websockets==12.0
10 |
11 | # Copy test script
12 | COPY tests/test_cluster.py .
13 |
14 | # Expose port range for mock nodes (default: 13120-13150)
15 | EXPOSE 13120-13150
16 |
17 | # Default: 3 nodes with 2,4,8 GPUs
18 | ENV NODES="2,4,8"
19 | ENV BASE_PORT="13120"
20 | ENV PREFIX="gpu-server"
21 |
22 | CMD python3 test_cluster.py --nodes ${NODES} --base-port ${BASE_PORT} --prefix ${PREFIX}
23 |
24 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 |
2 | services:
3 | gpu-hot:
4 | build: .
5 | ports:
6 | - "1312:1312"
7 | environment:
8 | - NVIDIA_VISIBLE_DEVICES=all
9 | - NVIDIA_DRIVER_CAPABILITIES=all
10 | - NODE_NAME=${HOSTNAME}
11 | deploy:
12 | resources:
13 | reservations:
14 | devices:
15 | - driver: nvidia
16 | count: all
17 | capabilities: [gpu]
18 | init: true
19 | pid: "host"
20 | restart: unless-stopped
21 | healthcheck:
22 | test: ["CMD", "curl", "-f", "http://localhost:1312/api/gpu-data"]
23 | interval: 30s
24 | timeout: 10s
25 | retries: 3
26 | start_period: 40s
27 |
28 |
--------------------------------------------------------------------------------
/core/metrics/utils.py:
--------------------------------------------------------------------------------
1 | """Utility functions for metrics collection"""
2 |
3 | import pynvml
4 |
5 |
def safe_get(func, *args, default=None):
    """Safely call an NVML function.

    Returns *default* when the call raises (e.g. the metric is unsupported
    on this GPU/driver) or when NVML returns None for the query.

    Args:
        func: NVML function to invoke.
        *args: Positional arguments forwarded to *func*.
        default: Value returned on failure or None result (keyword-only).
    """
    try:
        result = func(*args)
        # Some unsupported NVML queries return None instead of raising.
        return result if result is not None else default
    except Exception:
        # Intentionally broad: pynvml raises NVMLError subclasses, but driver
        # quirks can surface other exception types; callers expect a default.
        # (The old `(pynvml.NVMLError, Exception)` tuple was redundant.)
        return default
13 |
14 |
def decode_bytes(value):
    """Return *value* decoded as UTF-8 when it is a bytes object, else unchanged."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value
18 |
19 |
def to_mib(bytes_value):
    """Convert a byte count to mebibytes (MiB) as a float."""
    mib = bytes_value / 1048576  # 1048576 == 1024 ** 2
    return float(mib)
23 |
24 |
def to_watts(milliwatts):
    """Convert a milliwatt reading (NVML power unit) to watts as a float."""
    return float(milliwatts) / 1000.0
28 |
29 |
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | # EditorConfig is awesome: https://EditorConfig.org
2 |
3 | # top-most EditorConfig file
4 | root = true
5 |
6 | # Unix-style newlines with a newline ending every file
7 | [*]
8 | end_of_line = lf
9 | insert_final_newline = true
10 | charset = utf-8
11 | trim_trailing_whitespace = true
12 |
13 | # Python files
14 | [*.py]
15 | indent_style = space
16 | indent_size = 4
17 | max_line_length = 120
18 |
19 | # HTML/CSS/JS files
20 | [*.{html,css,js}]
21 | indent_style = space
22 | indent_size = 2
23 |
24 | # YAML files
25 | [*.{yml,yaml}]
26 | indent_style = space
27 | indent_size = 2
28 |
29 | # Markdown files
30 | [*.md]
31 | trim_trailing_whitespace = false
32 | max_line_length = off
33 |
34 | # Dockerfile
35 | [Dockerfile]
36 | indent_style = space
37 | indent_size = 2
38 |
39 | # Shell scripts
40 | [*.sh]
41 | indent_style = space
42 | indent_size = 2
43 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04
2 |
3 | # Set environment variables
4 | ENV DEBIAN_FRONTEND=noninteractive
5 | ENV PYTHONUNBUFFERED=1
6 |
7 | # Install system dependencies
8 | RUN apt-get update && apt-get install -y \
9 | python3 \
10 | python3-pip \
11 | curl \
12 | && rm -rf /var/lib/apt/lists/*
13 |
14 | # Set working directory
15 | WORKDIR /app
16 |
17 | # Copy requirements and install Python dependencies
18 | COPY requirements.txt .
19 | RUN pip3 install --no-cache-dir -r requirements.txt
20 |
21 | # Copy application code
22 | COPY . .
23 |
24 | # Create templates directory if it doesn't exist
25 | RUN mkdir -p templates
26 |
27 | # Expose port
28 | EXPOSE 1312
29 |
30 | # Health check
31 | HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
32 | CMD curl -f http://localhost:1312/api/gpu-data || exit 1
33 |
34 | # Run the application
35 | CMD ["python3", "app.py"]
36 |
37 |
--------------------------------------------------------------------------------
/core/config.py:
--------------------------------------------------------------------------------
"""
Configuration settings for GPU Hot
"""

import os
import socket

# Server Configuration (the app itself is FastAPI — see app.py)
# NOTE(review): SECRET_KEY is hard-coded and not obviously read anywhere
# visible in this file set — confirm whether it is still needed.
SECRET_KEY = 'gpu_hot_secret'
HOST = '0.0.0.0'  # bind on all interfaces (container-friendly)
PORT = 1312
DEBUG = False  # app.py switches log level between INFO and DEBUG on this flag

# Monitoring Configuration
UPDATE_INTERVAL = 0.5  # Update interval for NVML (sub-second monitoring)
NVIDIA_SMI_INTERVAL = 2.0  # Update interval for nvidia-smi fallback (slower to reduce overhead)

# GPU Monitoring Mode
# Can be set via environment variable: NVIDIA_SMI=true
NVIDIA_SMI = os.getenv('NVIDIA_SMI', 'false').lower() == 'true'

# Multi-Node Configuration
# MODE: default (single node monitoring), hub (aggregate multiple nodes)
MODE = os.getenv('GPU_HOT_MODE', 'default')
NODE_NAME = os.getenv('NODE_NAME', socket.gethostname())
# NODE_URLS: comma-separated URLs for hub mode (e.g., http://node1:1312,http://node2:1312)
NODE_URLS = [url.strip() for url in os.getenv('NODE_URLS', '').split(',') if url.strip()]
28 |
29 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 psalias
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.so
6 | .Python
7 | *.egg
8 | *.egg-info/
9 | dist/
10 | build/
11 | eggs/
12 | .eggs/
13 | lib/
14 | lib64/
15 | parts/
16 | sdist/
17 | var/
18 | wheels/
19 | pip-wheel-metadata/
20 | share/python-wheels/
21 | *.manifest
22 | *.spec
23 |
24 | # Virtual Environment
25 | venv/
26 | env/
27 | ENV/
28 | env.bak/
29 | venv.bak/
30 | .venv/
31 |
32 | # IDE / Editor
33 | .vscode/
34 | .idea/
35 | *.swp
36 | *.swo
37 | *~
38 | .project
39 | .pydevproject
40 | .settings/
41 | *.sublime-project
42 | *.sublime-workspace
43 | .DS_Store
44 |
45 | # Flask
46 | instance/
47 | .webassets-cache
48 | .env
49 | .flaskenv
50 |
51 | # Testing
52 | .pytest_cache/
53 | .coverage
54 | .coverage.*
55 | htmlcov/
56 | .tox/
57 | .nox/
58 | .hypothesis/
59 |
60 | # Logs
61 | *.log
62 | logs/
63 | *.log.*
64 |
65 | # Database
66 | *.db
67 | *.sqlite
68 | *.sqlite3
69 |
70 | # OS
71 | .DS_Store
72 | .DS_Store?
73 | ._*
74 | .Spotlight-V100
75 | .Trashes
76 | ehthumbs.db
77 | Thumbs.db
78 |
79 | # Docker
80 | docker-compose.override.yml
81 |
82 | # Environment variables
83 | .env
84 | .env.local
85 | .env.*.local
86 |
87 | # Temporary files
88 | *.tmp
89 | *.temp
90 | tmp/
91 | temp/
92 |
93 | # MacOS
94 | .AppleDouble
95 | .LSOverride
96 | Icon
97 |
--------------------------------------------------------------------------------
/static/js/app.js:
--------------------------------------------------------------------------------
1 | /**
2 | * GPU Hot - Main Application
3 | * Initializes the application when the DOM is ready
4 | */
5 |
// Application initialization: run once the DOM is ready.
document.addEventListener('DOMContentLoaded', function() {
    console.log('GPU Hot application initialized');

    // All functionality is loaded from other modules:
    // - chart-config.js / chart-manager.js: chart setup and updates
    // - ui.js: UI interactions and navigation
    // - socket-handlers.js: real-time data updates over the WebSocket
    // NOTE(review): an older comment named charts.js / gpu-cards.js, which
    // are not present in static/js — the list above matches the actual files.

    // The socket connection is established automatically when socket-handlers.js loads

    // Check for version updates
    checkVersion();
});
21 |
/**
 * Check current version and update availability.
 *
 * Fetches /api/version, writes the running version into #version-current,
 * and, when the backend reports an update, reveals #update-badge and points
 * #update-link at the release page.
 */
async function checkVersion() {
    try {
        const response = await fetch('/api/version');
        const data = await response.json();

        const versionCurrent = document.getElementById('version-current');
        const updateBadge = document.getElementById('update-badge');
        const updateLink = document.getElementById('update-link');

        if (versionCurrent) {
            versionCurrent.textContent = `v${data.current}`;
        }

        // Guard both elements: previously they were dereferenced unguarded,
        // so a missing element threw and was mis-reported by the catch block
        // below as a failed version check.
        if (data.update_available && data.latest && updateBadge && updateLink) {
            updateBadge.style.display = 'inline-block';
            updateLink.href = data.release_url || 'https://github.com/psalias2006/gpu-hot/releases/latest';
            updateLink.title = `Update to v${data.latest}`;
        }
    } catch (error) {
        console.debug('Failed to check version:', error);
        const versionCurrent = document.getElementById('version-current');
        if (versionCurrent) {
            versionCurrent.textContent = 'Unknown';
        }
    }
}
51 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Build and Push Docker Image
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | tags:
8 | - 'v*'
9 | workflow_dispatch:
10 |
11 | env:
12 | REGISTRY: ghcr.io
13 | IMAGE_NAME: ${{ github.repository }}
14 |
15 | jobs:
16 | build-and-push:
17 | runs-on: ubuntu-latest
18 | permissions:
19 | contents: read
20 | packages: write
21 | id-token: write
22 | attestations: write
23 |
24 | steps:
25 | - name: Checkout repository
26 | uses: actions/checkout@v4
27 |
28 | - name: Set up Docker Buildx
29 | uses: docker/setup-buildx-action@v3
30 |
31 | - name: Log in to ghcr
32 | if: github.event_name != 'pull_request'
33 | uses: docker/login-action@v3
34 | with:
35 | registry: ${{ env.REGISTRY }}
36 | username: ${{ github.actor }}
37 | password: ${{ secrets.GITHUB_TOKEN }}
38 |
39 | - name: Extract metadata
40 | id: meta
41 | uses: docker/metadata-action@v5
42 | with:
43 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
44 | tags: |
45 | type=ref,event=branch
46 | type=semver,pattern={{version}}
47 | type=raw,value=latest,enable={{is_default_branch}}
48 |
49 | - name: Build and Push Docker image
50 | id: build
51 | uses: docker/build-push-action@v5
52 | with:
53 | context: .
54 | platforms: linux/amd64,linux/arm64
55 | push: ${{ github.event_name != 'pull_request' }}
56 | tags: ${{ steps.meta.outputs.tags }}
57 | labels: ${{ steps.meta.outputs.labels }}
58 | cache-from: type=gha
59 | cache-to: type=gha,mode=max
60 | build-args: |
61 | BUILDKIT_INLINE_CACHE=1
62 |
63 | - name: Generate artifact
64 | if: github.event_name != 'pull_request'
65 | uses: actions/attest-build-provenance@v1
66 | with:
67 | subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
68 | subject-digest: ${{ steps.build.outputs.digest }}
69 | push-to-registry: true
70 |
--------------------------------------------------------------------------------
/tests/docker-compose.test.yml:
--------------------------------------------------------------------------------
1 | services:
2 | # Mock GPU cluster (simulates multiple GPU servers)
3 | mock-cluster:
4 | build:
5 | context: ..
6 | dockerfile: tests/Dockerfile.test
7 | container_name: gpu-hot-mock-cluster
8 | hostname: mock-cluster
9 | ports:
10 | - "13120-13150:13120-13150"
11 | environment:
12 | # LOAD TEST PRESETS - uncomment one:
13 | # LIGHT: 3 nodes, 5 GPUs (typical small lab)
14 | - NODES=1,2,2
15 | # MEDIUM: 8 nodes, 64 GPUs (medium cluster)
16 | #- NODES=8,8,8,8,8,8,8,8
17 | # HEAVY: 20 nodes, 160 GPUs (large production)
18 | # - NODES=8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
19 |
20 | - BASE_PORT=13120
21 | - PREFIX=gpu-server
22 | networks:
23 | gpu-hot-test:
24 | aliases:
25 | - mock-cluster
26 |
27 | # Hub (aggregates all mock nodes)
28 | hub:
29 | build:
30 | context: ..
31 | dockerfile: Dockerfile
32 | container_name: gpu-hot-hub
33 | ports:
34 | - "1312:1312"
35 | environment:
36 | - GPU_HOT_MODE=hub
37 | # Must match NODES count above:
38 | # LIGHT (3):
39 | - NODE_URLS=http://mock-cluster:13120,http://mock-cluster:13121,http://mock-cluster:13122
40 | # MEDIUM (8):
41 | #- NODE_URLS=http://mock-cluster:13120,http://mock-cluster:13121,http://mock-cluster:13122,http://mock-cluster:13123,http://mock-cluster:13124,http://mock-cluster:13125,http://mock-cluster:13126,http://mock-cluster:13127
42 | # HEAVY (20):
43 | # - NODE_URLS=http://mock-cluster:13120,http://mock-cluster:13121,http://mock-cluster:13122,http://mock-cluster:13123,http://mock-cluster:13124,http://mock-cluster:13125,http://mock-cluster:13126,http://mock-cluster:13127,http://mock-cluster:13128,http://mock-cluster:13129,http://mock-cluster:13130,http://mock-cluster:13131,http://mock-cluster:13132,http://mock-cluster:13133,http://mock-cluster:13134,http://mock-cluster:13135,http://mock-cluster:13136,http://mock-cluster:13137,http://mock-cluster:13138,http://mock-cluster:13139
44 | depends_on:
45 | mock-cluster:
46 | condition: service_started
47 | networks:
48 | - gpu-hot-test
49 |
50 | networks:
51 | gpu-hot-test:
52 | driver: bridge
53 |
54 |
--------------------------------------------------------------------------------
/core/hub_handlers.py:
--------------------------------------------------------------------------------
1 | """Async WebSocket handlers for hub mode"""
2 |
3 | import asyncio
4 | import logging
5 | import json
6 | from fastapi import WebSocket
7 |
8 | logger = logging.getLogger(__name__)
9 |
10 | # Global WebSocket connections
11 | websocket_connections = set()
12 |
def register_hub_handlers(app, hub):
    """Register FastAPI WebSocket handlers for hub mode.

    Args:
        app: FastAPI application to attach the WebSocket route to.
        hub: Hub instance aggregating data from the configured nodes.
    """

    # Path is kept as /socket.io/ for compatibility with the frontend's
    # connection URL; this is a plain WebSocket, not the Socket.IO protocol.
    @app.websocket("/socket.io/")
    async def websocket_endpoint(websocket: WebSocket):
        await websocket.accept()
        websocket_connections.add(websocket)
        logger.debug('Dashboard client connected')

        # First client connection lazily starts the shared broadcast loop.
        if not hub.running:
            hub.running = True
            asyncio.create_task(hub_loop(hub, websocket_connections))

        # Start node connections if not already started
        if not hub._connection_started:
            hub._connection_started = True
            asyncio.create_task(hub._connect_all_nodes())

        try:
            # Keep connection alive; incoming messages are ignored, but the
            # receive call is how we notice the client disconnecting.
            while True:
                await websocket.receive_text()
        except Exception as e:
            logger.debug(f'Dashboard client disconnected: {e}')
        finally:
            websocket_connections.discard(websocket)
39 |
40 |
async def hub_loop(hub, connections):
    """Async background loop that emits aggregated cluster data.

    Polls hub.get_cluster_data() every 0.5s and broadcasts the JSON-encoded
    payload to every connected dashboard WebSocket. Clients whose send fails
    are removed from *connections*.

    Args:
        hub: Hub instance; the loop runs while hub.running is truthy.
        connections: Mutable set of active WebSocket connections.
    """
    logger.info("Hub monitoring loop started")

    while hub.running:
        try:
            cluster_data = await hub.get_cluster_data()

            # Send to all connected clients
            if connections:
                # Serialize once per tick instead of once per client.
                payload = json.dumps(cluster_data)
                disconnected = set()
                for websocket in connections:
                    try:
                        await websocket.send_text(payload)
                    except Exception:
                        # Send failed -> treat the client as disconnected.
                        # (Was a bare `except:`, which would also swallow
                        # CancelledError and block clean task cancellation.)
                        disconnected.add(websocket)

                # Remove disconnected clients
                connections -= disconnected

        except Exception as e:
            logger.error(f"Error in hub loop: {e}")

        # Match node update rate for real-time responsiveness
        await asyncio.sleep(0.5)
66 |
67 |
--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
1 | # GPU Hot - Load Testing (FastAPI + AsyncIO)
2 |
3 | Simple load testing for multi-node GPU monitoring with realistic async patterns.
4 |
5 | ## Quick Start
6 |
7 | ```bash
8 | cd tests
9 | docker-compose -f docker-compose.test.yml up
10 | ```
11 |
12 | Open http://localhost:1312 to see the dashboard.
13 |
14 | ## Architecture
15 |
16 | - **FastAPI + AsyncIO**: Modern async Python for better performance
17 | - **Native WebSockets**: No Socket.IO overhead, direct WebSocket protocol
18 | - **Concurrent Mock Nodes**: Multiple nodes running in parallel
19 | - **Realistic GPU Patterns**: Training jobs with epochs, warmup, validation
20 |
21 | ## Load Test Presets
22 |
23 | Edit `docker-compose.test.yml` and uncomment the preset you want:
24 |
25 | ### LIGHT (3 nodes, 14 GPUs)
26 | Good for development and quick testing.
27 | ```yaml
28 | - NODES=2,4,8
29 | - NODE_URLS=http://mock-cluster:13120,http://mock-cluster:13121,http://mock-cluster:13122
30 | ```
31 |
32 | ### MEDIUM (8 nodes, 64 GPUs) ⭐ Default
33 | Realistic medium-sized cluster.
34 | ```yaml
35 | - NODES=8,8,8,8,8,8,8,8
36 | - NODE_URLS=http://mock-cluster:13120,...,http://mock-cluster:13127
37 | ```
38 |
39 | ### HEAVY (20 nodes, 160 GPUs)
40 | Stress test for large production environments.
41 | ```yaml
42 | - NODES=8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
43 | - NODE_URLS=http://mock-cluster:13120,...,http://mock-cluster:13139
44 | ```
45 |
46 | ## What's Simulated
47 |
48 | - **Realistic GPU patterns**: Training jobs with epochs, warmup, validation
49 | - **Idle + busy GPUs**: ~40% utilization typical of real clusters
50 | - **Stable memory**: Memory allocated at job start, stays constant
51 | - **Clock speeds**: Proper P-states (P0/P2/P8)
52 | - **Data loading dips**: Periodic utilization drops
53 | - **Temperature correlation**: Realistic thermal behavior
54 |
55 | ## Files
56 |
57 | - `test_cluster.py` - Mock GPU node with realistic patterns (FastAPI + AsyncIO)
58 | - `docker-compose.test.yml` - Test stack with preset configurations
59 | - `Dockerfile.test` - Container for mock nodes (FastAPI dependencies)
60 |
61 | ## Performance Benefits
62 |
63 | - **20-40% latency reduction** with true async/await
64 | - **2-3x more concurrent connections** supported
65 | - **Better resource utilization** for hub mode aggregation
66 | - **Sub-500ms latency** consistently achieved
67 |
68 | ## Rebuild After Changes
69 |
70 | ```bash
71 | docker-compose -f docker-compose.test.yml down
72 | docker-compose -f docker-compose.test.yml up --build
73 | ```
74 |
--------------------------------------------------------------------------------
/core/handlers.py:
--------------------------------------------------------------------------------
1 | """Async WebSocket handlers for real-time monitoring"""
2 |
3 | import asyncio
4 | import psutil
5 | import logging
6 | import json
7 | from datetime import datetime
8 | from fastapi import WebSocket
9 | from . import config
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 | # Global WebSocket connections
14 | websocket_connections = set()
15 |
def register_handlers(app, monitor):
    """Register FastAPI WebSocket handlers.

    Args:
        app: FastAPI application to attach the WebSocket route to.
        monitor: GPUMonitor instance supplying GPU/process data.
    """

    # Path is kept as /socket.io/ for compatibility with the frontend's
    # connection URL; this is a native WebSocket, not the Socket.IO protocol.
    @app.websocket("/socket.io/")
    async def websocket_endpoint(websocket: WebSocket):
        await websocket.accept()
        websocket_connections.add(websocket)
        logger.debug('Dashboard client connected')

        # First client connection lazily starts the shared monitoring loop.
        if not monitor.running:
            monitor.running = True
            asyncio.create_task(monitor_loop(monitor, websocket_connections))

        try:
            # Keep connection alive; the receive call is how we notice the
            # client disconnecting.
            while True:
                await websocket.receive_text()
        except Exception as e:
            logger.debug(f'Dashboard client disconnected: {e}')
        finally:
            websocket_connections.discard(websocket)
37 |
38 |
async def monitor_loop(monitor, connections):
    """Async background loop that collects and emits GPU data.

    Gathers GPU metrics and process info concurrently, attaches system-wide
    CPU/RAM stats, and broadcasts the JSON payload to all connected clients.
    The polling interval is slower when any GPU falls back to nvidia-smi.

    Args:
        monitor: GPUMonitor instance; the loop runs while monitor.running.
        connections: Mutable set of active WebSocket connections.
    """
    # Determine update interval based on whether any GPU uses nvidia-smi
    uses_nvidia_smi = any(monitor.use_smi.values()) if hasattr(monitor, 'use_smi') else False
    update_interval = config.NVIDIA_SMI_INTERVAL if uses_nvidia_smi else config.UPDATE_INTERVAL

    if uses_nvidia_smi:
        logger.info(f"Using nvidia-smi polling interval: {update_interval}s")
    else:
        logger.info(f"Using NVML polling interval: {update_interval}s")

    while monitor.running:
        try:
            # Collect data concurrently
            gpu_data, processes = await asyncio.gather(
                monitor.get_gpu_data(),
                monitor.get_processes()
            )

            system_info = {
                'cpu_percent': psutil.cpu_percent(percpu=False),
                'memory_percent': psutil.virtual_memory().percent,
                'timestamp': datetime.now().isoformat()
            }

            data = {
                'mode': config.MODE,
                'node_name': config.NODE_NAME,
                'gpus': gpu_data,
                'processes': processes,
                'system': system_info
            }

            # Send to all connected clients
            if connections:
                # Serialize once per tick instead of once per client.
                payload = json.dumps(data)
                disconnected = set()
                for websocket in connections:
                    try:
                        await websocket.send_text(payload)
                    except Exception:
                        # Send failed -> treat the client as disconnected.
                        # (Was a bare `except:`, which would also swallow
                        # CancelledError and block clean task cancellation.)
                        disconnected.add(websocket)

                # Remove disconnected clients
                connections -= disconnected

        except Exception as e:
            logger.error(f"Error in monitor loop: {e}")

        await asyncio.sleep(update_interval)
88 |
89 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # GPU Hot
4 |
5 | Real-time NVIDIA GPU monitoring dashboard. Web-based, no SSH required.
6 |
7 | [](https://www.python.org/)
8 | [](https://www.docker.com/)
9 | [](LICENSE)
10 | [](https://www.nvidia.com/)
11 |
12 |

13 |
14 |
15 |
16 | ---
17 |
18 | ## Usage
19 |
20 | Monitor a single machine or an entire cluster with the same Docker image.
21 |
22 | **Single machine:**
23 | ```bash
24 | docker run -d --gpus all -p 1312:1312 ghcr.io/psalias2006/gpu-hot:latest
25 | ```
26 |
27 | **Multiple machines:**
28 | ```bash
29 | # On each GPU server
30 | docker run -d --gpus all -p 1312:1312 -e NODE_NAME=$(hostname) ghcr.io/psalias2006/gpu-hot:latest
31 |
32 | # On a hub machine (no GPU required)
33 | docker run -d -p 1312:1312 -e GPU_HOT_MODE=hub -e NODE_URLS=http://server1:1312,http://server2:1312,http://server3:1312 ghcr.io/psalias2006/gpu-hot:latest
34 | ```
35 |
36 | Open `http://localhost:1312`
37 |
38 | **Older GPUs:** Add `-e NVIDIA_SMI=true` if metrics don't appear.
39 |
40 | **Process monitoring:** Add `--init --pid=host` to see process names. Note: This allows the container to access host process information.
41 |
42 | **From source:**
43 | ```bash
44 | git clone https://github.com/psalias2006/gpu-hot
45 | cd gpu-hot
46 | docker-compose up --build
47 | ```
48 |
49 | **Requirements:** Docker + [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
50 |
51 | ---
52 |
53 | ## Features
54 |
55 | - Real-time metrics (sub-second)
56 | - Automatic multi-GPU detection
57 | - Process monitoring (PID, memory usage)
58 | - Historical charts (utilization, temperature, power, clocks)
59 | - System metrics (CPU, RAM)
60 | - Scale from 1 to 100+ GPUs
61 |
62 | **Metrics:** Utilization, temperature, memory, power draw, fan speed, clock speeds, PCIe info, P-State, throttle status, encoder/decoder sessions
63 |
64 | ---
65 |
66 | ## Configuration
67 |
68 | **Environment variables:**
69 | ```bash
70 | NVIDIA_VISIBLE_DEVICES=0,1 # Specific GPUs (default: all)
71 | NVIDIA_SMI=true # Force nvidia-smi mode for older GPUs
72 | GPU_HOT_MODE=hub # Set to 'hub' for multi-node aggregation (default: single node)
73 | NODE_NAME=gpu-server-1 # Node display name (default: hostname)
74 | NODE_URLS=http://host:1312... # Comma-separated node URLs (required for hub mode)
75 | ```
76 |
77 | **Backend (`core/config.py`):**
78 | ```python
79 | UPDATE_INTERVAL = 0.5 # Polling interval
80 | PORT = 1312 # Server port
81 | ```
82 |
83 | ---
84 |
85 | ## API
86 |
87 | ### HTTP
88 | ```bash
89 | GET / # Dashboard
90 | GET /api/gpu-data # JSON metrics
91 | ```
92 |
93 | ### WebSocket
```javascript
// Native WebSocket (the /socket.io/ path is a plain WebSocket endpoint)
const ws = new WebSocket('ws://localhost:1312/socket.io/');
ws.onmessage = (event) => {
  const data = JSON.parse(event.data);
  // Updates every 0.5s (configurable)
  // Contains: data.gpus, data.processes, data.system
};
```
100 | ---
101 |
102 | ## Project Structure
103 |
104 | ```bash
105 | gpu-hot/
├── app.py                     # FastAPI + WebSocket server
├── core/
│   ├── config.py              # Configuration
│   ├── monitor.py             # NVML GPU monitoring
│   ├── handlers.py            # WebSocket handlers
│   ├── hub.py                 # Multi-node aggregation
│   ├── hub_handlers.py        # Hub WebSocket handlers
│   ├── nvidia_smi_fallback.py # nvidia-smi fallback
│   └── metrics/
│       ├── collector.py       # Metrics collection
│       └── utils.py           # Metric utilities
├── static/
│   ├── js/
│   │   ├── app.js             # Init
│   │   ├── ui.js              # View management
│   │   ├── chart-config.js    # Chart configs
│   │   ├── chart-manager.js   # Chart lifecycle
│   │   └── socket-handlers.js # WebSocket + rendering
│   └── favicon.svg
123 | ├── templates/index.html
124 | ├── Dockerfile
125 | └── docker-compose.yml
126 | ```
127 |
128 | ---
129 |
130 | ## Troubleshooting
131 |
132 | **No GPUs detected:**
133 | ```bash
134 | nvidia-smi # Verify drivers work
135 | docker run --rm --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi # Test Docker GPU access
136 | ```
137 |
138 | **Hub can't connect to nodes:**
139 | ```bash
140 | curl http://node-ip:1312/api/gpu-data # Test connectivity
141 | sudo ufw allow 1312/tcp # Check firewall
142 | ```
143 |
144 | **Performance issues:** Increase `UPDATE_INTERVAL` in `core/config.py`
145 |
146 | ---
147 |
148 | ## Star History
149 |
150 | [](https://www.star-history.com/#psalias2006/gpu-hot&type=date&legend=top-left)
151 |
152 | ## Contributing
153 |
154 | PRs welcome. Open an issue for major changes.
155 |
156 | ## License
157 |
158 | MIT - see [LICENSE](LICENSE)
159 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """GPU Hot - Real-time NVIDIA GPU Monitoring Dashboard (FastAPI + AsyncIO)"""
3 |
4 | import asyncio
5 | import logging
6 | import aiohttp
7 | from fastapi import FastAPI, WebSocket, WebSocketDisconnect
8 | from fastapi.staticfiles import StaticFiles
9 | from fastapi.responses import HTMLResponse, JSONResponse
10 | from core import config
11 | from version import __version__
12 |
13 | # Setup logging
14 | logging.basicConfig(
15 | level=logging.DEBUG if config.DEBUG else logging.INFO,
16 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
17 | )
18 | logger = logging.getLogger(__name__)
19 |
20 | app = FastAPI(title="GPU Hot", version=__version__)
21 |
22 | # Serve static files
23 | app.mount("/static", StaticFiles(directory="static"), name="static")
24 |
# Mode selection: decide at startup whether this process is a hub (aggregating
# other nodes) or a regular single-node monitor. The chosen object is bound to
# monitor_or_hub, which the REST endpoints below use regardless of mode.
if config.MODE == 'hub':
    # Hub mode: aggregate data from multiple nodes
    if not config.NODE_URLS:
        raise ValueError("Hub mode requires NODE_URLS environment variable")

    logger.info("Starting GPU Hot in HUB mode (FastAPI)")
    logger.info(f"Connecting to {len(config.NODE_URLS)} node(s): {config.NODE_URLS}")

    # Imported only in this branch so single-node runs never load hub code.
    from core.hub import Hub
    from core.hub_handlers import register_hub_handlers

    hub = Hub(config.NODE_URLS)
    register_hub_handlers(app, hub)
    monitor_or_hub = hub

else:
    # Default mode: monitor local GPUs and serve dashboard
    logger.info("Starting GPU Hot (FastAPI)")
    logger.info(f"Node name: {config.NODE_NAME}")

    # Imported only in this branch so hub-only deployments don't need NVML.
    from core.monitor import GPUMonitor
    from core.handlers import register_handlers

    monitor = GPUMonitor()
    register_handlers(app, monitor)
    monitor_or_hub = monitor
52 |
53 |
@app.get("/")
async def index():
    """Serve the main dashboard.

    Returns:
        HTMLResponse containing templates/index.html.
    """
    # Explicit encoding: the template is UTF-8; the platform default
    # (locale-dependent) could mis-decode it on some systems.
    with open("templates/index.html", "r", encoding="utf-8") as f:
        return HTMLResponse(content=f.read())
59 |
60 |
@app.get("/api/gpu-data")
async def api_gpu_data():
    """REST API endpoint for GPU data."""
    # Hub mode has no local monitor; report an empty payload.
    if config.MODE == 'hub':
        return {"gpus": {}, "timestamp": "hub_mode"}

    # Duck-typed: only call get_gpu_data when the active backend provides it.
    getter = getattr(monitor_or_hub, 'get_gpu_data', None)
    if getter is not None:
        return {"gpus": await getter(), "timestamp": "async"}

    return {"gpus": {}, "timestamp": "no_data"}
71 |
72 |
def compare_versions(current, latest):
    """Compare dotted semantic version strings.

    Args:
        current: Installed version, e.g. ``"1.6.0"``.
        latest: Candidate version, e.g. ``"1.7"``.

    Returns:
        True if ``latest`` is strictly newer than ``current``; False when
        the versions are equal, ``latest`` is older, or either value is not
        a dotted-integer version string (e.g. ``"1.2-rc1"`` or ``None``).
    """
    from itertools import zip_longest

    try:
        current_parts = [int(x) for x in current.split('.')]
        latest_parts = [int(x) for x in latest.split('.')]

        # zip_longest pads the shorter version with zeros, so "1.2" compares
        # equal to "1.2.0" — same semantics as the manual padding it replaces.
        for c, l in zip_longest(current_parts, latest_parts, fillvalue=0):
            if l != c:
                return l > c

        return False  # Versions are equal
    except (ValueError, AttributeError):
        # Non-numeric components or non-string input: treat as "no update"
        # rather than propagating an error to the version endpoint.
        return False
94 |
95 |
@app.get("/api/version")
async def api_version():
    """Get current version and check for updates from GitHub.

    Returns JSON with the running version, the latest published release (if
    reachable within 5s), whether an update is available, and the release
    URL. Any failure degrades to "current version only, no update info".
    """
    current_version = __version__

    try:
        # Check GitHub for latest release
        async with aiohttp.ClientSession() as session:
            async with session.get(
                "https://api.github.com/repos/psalias2006/gpu-hot/releases/latest",
                timeout=aiohttp.ClientTimeout(total=5)
            ) as response:
                if response.status == 200:
                    data = await response.json()
                    # Release tags look like "v1.6.0"; strip the leading "v"
                    # so the comparison sees plain dotted numbers.
                    latest_version = data.get("tag_name", "").lstrip("v")

                    # Only show update if latest > current
                    update_available = compare_versions(current_version, latest_version) if latest_version else False

                    return JSONResponse({
                        "current": current_version,
                        "latest": latest_version,
                        "update_available": update_available,
                        "release_url": data.get("html_url", "")
                    })
                # Non-200 responses (rate limit, outage) intentionally fall
                # through to the offline fallback below.
    except Exception as e:
        # Network/timeout problems are expected in air-gapped deployments;
        # log at debug level only.
        logger.debug(f"Failed to check for updates: {e}")

    # Return current version even if GitHub check fails
    return JSONResponse({
        "current": current_version,
        "latest": None,
        "update_available": False,
        "release_url": None
    })
131 |
132 |
if __name__ == '__main__':
    import uvicorn
    try:
        logger.info(f"Server running on {config.HOST}:{config.PORT}")
        uvicorn.run(app, host=config.HOST, port=config.PORT, log_level="info")
    finally:
        # Best-effort cleanup once uvicorn returns (Ctrl-C or crash).
        # asyncio.run starts a fresh event loop here; uvicorn's own loop has
        # already exited at this point, so that is safe.
        if hasattr(monitor_or_hub, 'shutdown'):
            asyncio.run(monitor_or_hub.shutdown())
141 |
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | GPU Hot
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
36 |
37 |
38 |
39 |
40 |
Live Monitoring
41 |
42 |
Connecting...
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 | Loading GPU data...
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
72 |
73 |
74 |
75 |
76 | Loading processes...
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
0%
86 |
System CPU
87 |
Host Processor
88 |
89 |
90 |
91 |
0%
92 |
System RAM
93 |
Host Memory
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
--------------------------------------------------------------------------------
/static/js/ui.js:
--------------------------------------------------------------------------------
1 | /**
2 | * UI Interactions and navigation
3 | */
4 |
5 | // Global state
6 | let currentTab = 'overview';
7 | let registeredGPUs = new Set();
8 | let hasAutoSwitched = false; // Track if we've done initial auto-switch
9 |
// Expand/collapse the processes panel: the content pane, its header, and
// the chevron icon all share the same 'expanded' CSS state.
function toggleProcesses() {
    const targets = [
        document.getElementById('processes-content'),
        document.querySelector('.processes-header'),
        document.querySelector('.toggle-icon'),
    ];
    for (const el of targets) {
        el.classList.toggle('expanded');
    }
}
20 |
// Tab switching with smooth transitions.
// viewName is either 'overview' or 'gpu-<id>'; matching DOM uses
// .view-option[data-view=...] buttons and #tab-<viewName> panes.
function switchToView(viewName) {
    if (!viewName) {
        console.warn('switchToView: Missing viewName');
        return;
    }

    // Track the active tab globally (read by removeGPUTab, among others).
    currentTab = viewName;

    // Update view selector states
    document.querySelectorAll('.view-option').forEach(btn => {
        btn.classList.remove('active');
        if (btn.dataset.view === viewName) {
            btn.classList.add('active');
        }
    });

    // Switch tab content with animation
    document.querySelectorAll('.tab-content').forEach(content => {
        content.classList.remove('active');
    });

    const targetContent = document.getElementById(`tab-${viewName}`);
    if (!targetContent) {
        console.warn(`switchToView: Tab content not found for "${viewName}"`);
        return;
    }

    targetContent.classList.add('active');

    // Trigger chart resize for visible charts immediately without animation.
    // Charts rendered while their tab was hidden have a stale canvas size.
    if (viewName.startsWith('gpu-')) {
        const gpuId = viewName.replace('gpu-', '');

        // Disable animations during resize to prevent glitchy transitions.
        // NOTE(review): `charts` is a global presumably defined in
        // chart-manager.js — confirm it is loaded before ui.js.
        if (charts && charts[gpuId]) {
            Object.values(charts[gpuId]).forEach(chart => {
                if (!chart) return;

                try {
                    if (chart.options) {
                        // Store original animation setting
                        const originalAnimation = chart.options.animation;

                        // Temporarily disable all animations
                        chart.options.animation = false;

                        // Resize without animation
                        if (typeof chart.resize === 'function') {
                            chart.resize();
                        }

                        // Force immediate update without animation
                        if (typeof chart.update === 'function') {
                            chart.update('none');
                        }

                        // Restore original animation setting
                        chart.options.animation = originalAnimation;
                    }
                } catch (error) {
                    console.error(`Error resizing chart for GPU ${gpuId}:`, error);
                }
            });
        }
    }
}
88 |
// Create or update GPU tab.
// Idempotent: first call for a gpuId builds the selector button and tab
// pane; subsequent calls only refresh the card contents.
// NOTE(review): createGPUCard / initGPUData / initGPUCharts /
// updateGPUDisplay and the chartData/charts globals come from the other
// static/js modules — confirm load order.
function ensureGPUTab(gpuId, gpuInfo, shouldUpdateDOM = true) {
    if (!registeredGPUs.has(gpuId)) {
        // Add view option
        const viewSelector = document.getElementById('view-selector');
        const viewOption = document.createElement('button');
        viewOption.className = 'view-option';
        viewOption.dataset.view = `gpu-${gpuId}`;
        viewOption.textContent = `GPU ${gpuId}`;
        viewOption.onclick = () => switchToView(`gpu-${gpuId}`);
        viewSelector.appendChild(viewOption);

        // Create tab content (empty shell; card is filled in below)
        const tabContent = document.createElement('div');
        tabContent.id = `tab-gpu-${gpuId}`;
        tabContent.className = 'tab-content';
        tabContent.innerHTML = ``;
        document.getElementById('tab-overview').after(tabContent);

        registeredGPUs.add(gpuId);
    }

    // Update or create detailed GPU card in tab
    const detailedContainer = document.querySelector(`#tab-gpu-${gpuId} .detailed-view`);
    const existingCard = document.getElementById(`gpu-${gpuId}`);

    if (!existingCard && detailedContainer) {
        detailedContainer.innerHTML = createGPUCard(gpuId, gpuInfo);
        // Do not reinitialize chartData here; it would break existing chart references
        if (!chartData[gpuId]) initGPUData(gpuId);
        initGPUCharts(gpuId);
    } else if (existingCard) {
        updateGPUDisplay(gpuId, gpuInfo, shouldUpdateDOM);
    }
}
124 |
// Tear down a GPU's tab: selector button, tab pane, chart instances, and
// its entry in the registered set. No-op for unknown gpuIds.
function removeGPUTab(gpuId) {
    if (!registeredGPUs.has(gpuId)) return; // Nothing to remove

    // Navigate away first if the user is currently on this GPU's tab
    if (currentTab === `gpu-${gpuId}`) {
        switchToView('overview');
    }

    // Drop the selector button and the tab pane (each may already be gone)
    document.querySelector(`.view-option[data-view="gpu-${gpuId}"]`)?.remove();
    document.getElementById(`tab-gpu-${gpuId}`)?.remove();

    // Release Chart.js instances before dropping the reference
    const gpuCharts = charts[gpuId];
    if (gpuCharts) {
        for (const chart of Object.values(gpuCharts)) {
            if (chart && chart.destroy) {
                chart.destroy();
            }
        }
        delete charts[gpuId];
    }

    registeredGPUs.delete(gpuId);
}
161 |
// When exactly one GPU is present, jump straight to its detail view.
// Guarded by hasAutoSwitched so it only ever fires once per page load.
function autoSwitchSingleGPU(gpuCount, gpuIds) {
    if (gpuCount !== 1 || hasAutoSwitched) return;

    hasAutoSwitched = true;
    const [onlyGpuId] = gpuIds;
    // Small delay so the freshly created tab DOM is in place first
    setTimeout(() => switchToView(`gpu-${onlyGpuId}`), 300);
}

// Expose for inline event handlers in generated markup
window.switchToView = switchToView;
175 |
--------------------------------------------------------------------------------
/core/hub.py:
--------------------------------------------------------------------------------
1 | """Async Hub mode - aggregates data from multiple nodes"""
2 |
3 | import asyncio
4 | import logging
5 | import json
6 | import websockets
7 | from datetime import datetime
8 | from . import config
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
class Hub:
    """Aggregates GPU data from multiple nodes.

    Each configured node URL gets a long-lived WebSocket connection; the
    latest payload received from every node is cached in ``self.nodes`` and
    served to dashboard clients via :meth:`get_cluster_data`.
    """

    def __init__(self, node_urls):
        """
        Args:
            node_urls: List of node base URLs (``http://`` or ``https://``).
        """
        self.node_urls = node_urls
        self.nodes = {}  # node_name -> {url, websocket, data, status, last_update}
        self.url_to_node = {}  # url -> node_name mapping
        # NOTE(review): nothing in this class sets running=True; presumably
        # core.hub_handlers flips it before the connect loop starts — confirm.
        self.running = False
        self._connection_started = False

        # Until a node reports its name we only know its URL, so each entry
        # is keyed by URL and marked offline.
        for url in node_urls:
            self.nodes[url] = {
                'url': url,
                'websocket': None,
                'data': None,
                'status': 'offline',
                'last_update': None
            }
            self.url_to_node[url] = url

    async def _connect_all_nodes(self):
        """Connect to all nodes in background with retries."""
        # Wait a bit for Docker network to be ready
        await asyncio.sleep(2)

        # Connect to all nodes concurrently
        tasks = [self._connect_node_with_retry(url) for url in self.node_urls]
        await asyncio.gather(*tasks, return_exceptions=True)

    async def _connect_node_with_retry(self, url):
        """Connect to a node, retrying up to 5 times with a fixed delay."""
        max_retries = 5
        retry_delay = 2

        for attempt in range(max_retries):
            try:
                await self._connect_node(url)
                return  # Success
            except Exception as e:
                if attempt < max_retries - 1:
                    logger.warning(f'Connection attempt {attempt + 1}/{max_retries} failed for {url}: {str(e)}, retrying in {retry_delay}s...')
                    await asyncio.sleep(retry_delay)
                else:
                    logger.error(f'Failed to connect to node {url} after {max_retries} attempts: {str(e)}')

    async def _connect_node(self, url):
        """Maintain a WebSocket connection to one node while ``running``.

        Caches every JSON message into ``self.nodes``, marks the node
        offline on disconnect/error, and reconnects after 5 seconds.
        """
        while self.running:
            try:
                # Convert HTTP URL to WebSocket URL
                ws_url = url.replace('http://', 'ws://').replace('https://', 'wss://') + '/socket.io/'

                logger.info(f'Connecting to node WebSocket: {ws_url}')

                async with websockets.connect(ws_url) as websocket:
                    logger.info(f'Connected to node: {url}')

                    # Mark node as online (still keyed by URL until the node
                    # tells us its real name in a payload).
                    node_name = self.url_to_node.get(url, url)
                    self.nodes[node_name] = {
                        'url': url,
                        'websocket': websocket,
                        'data': None,
                        'status': 'online',
                        'last_update': datetime.now().isoformat()
                    }

                    # Listen for data from the node
                    async for message in websocket:
                        try:
                            data = json.loads(message)

                            # Extract node name from data or use URL as fallback
                            node_name = data.get('node_name', url)

                            # Update URL to node mapping
                            self.url_to_node[url] = node_name

                            # Update node entry with received data
                            self.nodes[node_name] = {
                                'url': url,
                                'websocket': websocket,
                                'data': data,
                                'status': 'online',
                                'last_update': datetime.now().isoformat()
                            }

                        except json.JSONDecodeError as e:
                            logger.error(f'Failed to parse message from {url}: {e}')
                        except Exception as e:
                            logger.error(f'Error processing message from {url}: {e}')

            except websockets.exceptions.ConnectionClosed:
                logger.warning(f'WebSocket connection closed for node: {url}')
                # Mark node as offline
                node_name = self.url_to_node.get(url, url)
                if node_name in self.nodes:
                    self.nodes[node_name]['status'] = 'offline'
                    logger.info(f'Marked node {node_name} as offline')
            except Exception as e:
                logger.error(f'Failed to connect to node {url}: {e}')
                # Mark node as offline
                node_name = self.url_to_node.get(url, url)
                if node_name in self.nodes:
                    self.nodes[node_name]['status'] = 'offline'
                    logger.info(f'Marked node {node_name} as offline')

            # Wait before retrying connection
            if self.running:
                await asyncio.sleep(5)

    async def get_cluster_data(self):
        """Return the aggregated cluster snapshot for all known nodes.

        Offline nodes (or online nodes that have not sent data yet) are
        reported with empty gpus/processes/system payloads.
        """
        nodes = {}
        total_gpus = 0
        online_nodes = 0

        for node_name, node_info in self.nodes.items():
            if node_info['status'] == 'online' and node_info['data']:
                nodes[node_name] = {
                    'status': 'online',
                    'gpus': node_info['data'].get('gpus', {}),
                    'processes': node_info['data'].get('processes', []),
                    'system': node_info['data'].get('system', {}),
                    'last_update': node_info['last_update']
                }
                total_gpus += len(node_info['data'].get('gpus', {}))
                online_nodes += 1
            else:
                nodes[node_name] = {
                    'status': 'offline',
                    'gpus': {},
                    'processes': [],
                    'system': {},
                    'last_update': node_info.get('last_update')
                }

        return {
            'mode': 'hub',
            'nodes': nodes,
            'cluster_stats': {
                'total_nodes': len(self.nodes),
                'online_nodes': online_nodes,
                'total_gpus': total_gpus
            }
        }

    async def shutdown(self):
        """Stop the reconnect loops and close all open node connections."""
        self.running = False
        for node_info in self.nodes.values():
            if node_info.get('websocket'):
                try:
                    await node_info['websocket'].close()
                # Bug fix: was a bare `except:`, which also swallows
                # KeyboardInterrupt/SystemExit; close() is best-effort only.
                except Exception:
                    pass
170 |
171 |
--------------------------------------------------------------------------------
/core/nvidia_smi_fallback.py:
--------------------------------------------------------------------------------
1 | """
2 | Simple nvidia-smi fallback parser
3 | Based on the original working implementation
4 | """
5 |
6 | import subprocess
7 | import logging
8 | from datetime import datetime
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
def parse_nvidia_smi():
    """Parse nvidia-smi output and extract comprehensive GPU information.

    Returns:
        dict mapping GPU index (str) to a metrics dict. Falls back to
        :func:`parse_nvidia_smi_fallback` when the comprehensive query fails
        or yields no rows; returns {} only when every query fails.
    """
    try:
        result = subprocess.run([
            'nvidia-smi',
            '--query-gpu=index,name,uuid,driver_version,vbios_version,'
            'temperature.gpu,utilization.gpu,utilization.memory,'
            'memory.used,memory.total,memory.free,power.draw,power.limit,'
            'fan.speed,clocks.gr,clocks.sm,clocks.mem,'
            'clocks.max.gr,clocks.max.sm,clocks.max.mem,'
            'pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max,'
            'encoder.stats.sessionCount,encoder.stats.averageFps,encoder.stats.averageLatency,'
            'pstate,compute_mode',
            '--format=csv,noheader,nounits'
        ], capture_output=True, text=True, timeout=10)

        if result.returncode != 0:
            logger.warning(f"nvidia-smi comprehensive query failed (code {result.returncode}), trying basic query")
            return parse_nvidia_smi_fallback()

        lines = result.stdout.strip().split('\n')
        gpu_data = {}

        for line in lines:
            if line.strip():
                parts = [p.strip() for p in line.split(',')]
                # 27 is the minimum usable row; pstate/compute_mode (27/28)
                # are length-guarded individually below.
                if len(parts) >= 27:
                    gpu_id = parts[0]
                    gpu_data[gpu_id] = {
                        'index': parts[0],
                        'name': parts[1],
                        'uuid': parts[2] if parts[2] not in ['N/A', '[N/A]', ''] else 'N/A',
                        'driver_version': parts[3] if parts[3] not in ['N/A', '[N/A]', ''] else 'N/A',
                        'vbios_version': parts[4] if parts[4] not in ['N/A', '[N/A]', ''] else 'N/A',
                        'temperature': float(parts[5]) if parts[5] not in ['N/A', '[N/A]', ''] else 0,
                        'temperature_memory': 0,
                        'utilization': float(parts[6]) if parts[6] not in ['N/A', '[N/A]', ''] else 0,
                        'memory_utilization': float(parts[7]) if parts[7] not in ['N/A', '[N/A]', ''] else 0,
                        'memory_used': float(parts[8]) if parts[8] not in ['N/A', '[N/A]', ''] else 0,
                        'memory_total': float(parts[9]) if parts[9] not in ['N/A', '[N/A]', ''] else 0,
                        'memory_free': float(parts[10]) if parts[10] not in ['N/A', '[N/A]', ''] else 0,
                        'power_draw': float(parts[11]) if parts[11] not in ['N/A', '[N/A]', ''] else 0,
                        'power_limit': float(parts[12]) if parts[12] not in ['N/A', '[N/A]', ''] else 0,
                        'power_default_limit': 0,
                        'fan_speed': float(parts[13]) if parts[13] not in ['N/A', '[N/A]', ''] else 0,
                        'clock_graphics': float(parts[14]) if parts[14] not in ['N/A', '[N/A]', ''] else 0,
                        'clock_sm': float(parts[15]) if parts[15] not in ['N/A', '[N/A]', ''] else 0,
                        'clock_memory': float(parts[16]) if parts[16] not in ['N/A', '[N/A]', ''] else 0,
                        'clock_video': 0,
                        'clock_max_graphics': float(parts[17]) if parts[17] not in ['N/A', '[N/A]', ''] else 0,
                        'clock_max_sm': float(parts[18]) if parts[18] not in ['N/A', '[N/A]', ''] else 0,
                        'clock_max_memory': float(parts[19]) if parts[19] not in ['N/A', '[N/A]', ''] else 0,
                        'pcie_gen': parts[20] if parts[20] not in ['N/A', '[N/A]', ''] else 'N/A',
                        'pcie_gen_max': parts[21] if parts[21] not in ['N/A', '[N/A]', ''] else 'N/A',
                        'pcie_width': parts[22] if parts[22] not in ['N/A', '[N/A]', ''] else 'N/A',
                        'pcie_width_max': parts[23] if parts[23] not in ['N/A', '[N/A]', ''] else 'N/A',
                        'encoder_sessions': int(parts[24]) if parts[24] not in ['N/A', '[N/A]', ''] else 0,
                        'encoder_fps': float(parts[25]) if parts[25] not in ['N/A', '[N/A]', ''] else 0,
                        'encoder_latency': float(parts[26]) if parts[26] not in ['N/A', '[N/A]', ''] else 0,
                        'decoder_sessions': 0,
                        'decoder_fps': 0,
                        'decoder_latency': 0,
                        'performance_state': parts[27] if len(parts) > 27 and parts[27] not in ['N/A', '[N/A]', ''] else 'N/A',
                        'compute_mode': parts[28] if len(parts) > 28 and parts[28] not in ['N/A', '[N/A]', ''] else 'N/A',
                        'throttle_reasons': 'None',
                        'timestamp': datetime.now().isoformat(),
                        '_fallback_mode': True
                    }

        if gpu_data:
            logger.debug(f"nvidia-smi returned data for {len(gpu_data)} GPU(s)")
            return gpu_data

        # Bug fix: previously this fell through and implicitly returned None
        # when the query succeeded but parsed no rows; callers expect a dict.
        logger.warning("nvidia-smi comprehensive query returned no GPU rows, trying basic query")
        return parse_nvidia_smi_fallback()

    except subprocess.TimeoutExpired:
        logger.error("nvidia-smi command timed out (>10s)")
        return {}
    except Exception as e:
        logger.error(f"nvidia-smi comprehensive query error: {e}, trying basic query")
        return parse_nvidia_smi_fallback()
92 |
93 |
def parse_nvidia_smi_fallback():
    """Fallback parser with minimal, widely-supported fields.

    Returns:
        dict mapping GPU index (str) to a metrics dict with the same key set
        as the comprehensive parser (unsupported fields zeroed/'N/A').
        Always returns a dict; {} when the query fails or parses no rows.
    """
    try:
        logger.info("Using basic nvidia-smi query (minimal fields)")
        result = subprocess.run([
            'nvidia-smi',
            '--query-gpu=index,name,temperature.gpu,utilization.gpu,utilization.memory,'
            'memory.used,memory.total,power.draw,power.limit,fan.speed,'
            'clocks.gr,clocks.sm,clocks.mem,pstate',
            '--format=csv,noheader,nounits'
        ], capture_output=True, text=True, timeout=10)

        if result.returncode != 0:
            logger.error(f"Basic nvidia-smi query also failed (code {result.returncode})")
            return {}

        lines = result.stdout.strip().split('\n')
        gpu_data = {}

        for line in lines:
            if line.strip():
                parts = [p.strip() for p in line.split(',')]
                if len(parts) >= 14:
                    gpu_id = parts[0]
                    gpu_data[gpu_id] = {
                        'index': parts[0],
                        'name': parts[1],
                        'uuid': 'N/A',
                        'driver_version': 'N/A',
                        'vbios_version': 'N/A',
                        'temperature': float(parts[2]) if parts[2] not in ['N/A', '[N/A]', ''] else 0,
                        'temperature_memory': 0,
                        'utilization': float(parts[3]) if parts[3] not in ['N/A', '[N/A]', ''] else 0,
                        'memory_utilization': float(parts[4]) if parts[4] not in ['N/A', '[N/A]', ''] else 0,
                        'memory_used': float(parts[5]) if parts[5] not in ['N/A', '[N/A]', ''] else 0,
                        'memory_total': float(parts[6]) if parts[6] not in ['N/A', '[N/A]', ''] else 0,
                        # Derived: free = total - used (the basic query has no memory.free)
                        'memory_free': float(parts[6]) - float(parts[5]) if parts[6] not in ['N/A', '[N/A]', ''] and parts[5] not in ['N/A', '[N/A]', ''] else 0,
                        'power_draw': float(parts[7]) if parts[7] not in ['N/A', '[N/A]', ''] else 0,
                        'power_limit': float(parts[8]) if parts[8] not in ['N/A', '[N/A]', ''] else 0,
                        'power_default_limit': 0,
                        'fan_speed': float(parts[9]) if parts[9] not in ['N/A', '[N/A]', ''] else 0,
                        'clock_graphics': float(parts[10]) if parts[10] not in ['N/A', '[N/A]', ''] else 0,
                        'clock_sm': float(parts[11]) if parts[11] not in ['N/A', '[N/A]', ''] else 0,
                        'clock_memory': float(parts[12]) if parts[12] not in ['N/A', '[N/A]', ''] else 0,
                        'clock_video': 0,
                        'clock_max_graphics': 0,
                        'clock_max_sm': 0,
                        'clock_max_memory': 0,
                        'pcie_gen': 'N/A',
                        'pcie_gen_max': 'N/A',
                        'pcie_width': 'N/A',
                        'pcie_width_max': 'N/A',
                        'encoder_sessions': 0,
                        'encoder_fps': 0,
                        'encoder_latency': 0,
                        'decoder_sessions': 0,
                        'decoder_fps': 0,
                        'decoder_latency': 0,
                        'performance_state': parts[13] if parts[13] not in ['N/A', '[N/A]', ''] else 'N/A',
                        'compute_mode': 'N/A',
                        'throttle_reasons': 'None',
                        'timestamp': datetime.now().isoformat(),
                        '_fallback_mode': True
                    }

        if gpu_data:
            logger.info(f"Basic nvidia-smi query successful - Found {len(gpu_data)} GPU(s)")
        # Bug fix: previously returned None (implicit) when no rows parsed;
        # callers treat the result as a dict, so always return one.
        return gpu_data

    except Exception as e:
        logger.error(f"Basic nvidia-smi query failed: {e}")
        return {}
166 |
167 |
--------------------------------------------------------------------------------
/core/monitor.py:
--------------------------------------------------------------------------------
1 | """Async GPU monitoring using NVML"""
2 |
3 | import asyncio
4 | import pynvml
5 | import psutil
6 | import logging
7 | from .metrics import MetricsCollector
8 | from .nvidia_smi_fallback import parse_nvidia_smi
9 | from .config import NVIDIA_SMI
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 |
class GPUMonitor:
    """Monitor NVIDIA GPUs using NVML, with a per-GPU nvidia-smi fallback.

    At boot every GPU is probed once via NVML; GPUs whose utilization metric
    is unavailable (or whose probe fails) are served from parsed nvidia-smi
    output instead. That decision is cached in ``self.use_smi`` and never
    revisited at runtime.
    """

    def __init__(self):
        self.running = False
        self.gpu_data = {}  # gpu_id (str) -> latest metrics dict
        self.collector = MetricsCollector()
        self.use_smi = {}  # Track which GPUs use nvidia-smi (decided at boot)

        try:
            pynvml.nvmlInit()
            self.initialized = True
            version = pynvml.nvmlSystemGetDriverVersion()
            # Older NVML bindings return bytes for string values
            if isinstance(version, bytes):
                version = version.decode('utf-8')
            logger.info(f"NVML initialized - Driver: {version}")

            # Detect which GPUs need nvidia-smi (once at boot)
            self._detect_smi_gpus()

        except Exception as e:
            logger.error(f"Failed to initialize NVML: {e}")
            self.initialized = False

    def _detect_smi_gpus(self):
        """Detect which GPUs need nvidia-smi fallback (called once at boot)."""
        try:
            device_count = pynvml.nvmlDeviceGetCount()
            logger.info(f"Detected {device_count} GPU(s)")

            # Config escape hatch: force nvidia-smi everywhere
            if NVIDIA_SMI:
                logger.warning("NVIDIA_SMI=True - Forcing nvidia-smi for all GPUs")
                for i in range(device_count):
                    self.use_smi[str(i)] = True
                return

            # Auto-detect per GPU: a missing utilization reading means NVML
            # cannot fully serve this device.
            for i in range(device_count):
                gpu_id = str(i)
                try:
                    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                    data = self.collector.collect_all(handle, gpu_id)
                    gpu_name = data.get('name', 'Unknown')

                    if 'utilization' not in data or data.get('utilization') is None:
                        self.use_smi[gpu_id] = True
                        logger.warning(f"GPU {i} ({gpu_name}): Utilization metric not available via NVML")
                        logger.warning(f"GPU {i} ({gpu_name}): Switching to nvidia-smi mode")
                    else:
                        self.use_smi[gpu_id] = False
                        logger.info(f"GPU {i} ({gpu_name}): Using NVML (utilization: {data.get('utilization')}%)")

                except Exception as e:
                    self.use_smi[gpu_id] = True
                    logger.error(f"GPU {i}: NVML detection failed - {e}")
                    logger.warning(f"GPU {i}: Falling back to nvidia-smi")

            # Summary
            nvml_count = sum(1 for use_smi in self.use_smi.values() if not use_smi)
            smi_count = sum(1 for use_smi in self.use_smi.values() if use_smi)
            if smi_count > 0:
                logger.info(f"Boot detection complete: {nvml_count} GPU(s) using NVML, {smi_count} GPU(s) using nvidia-smi")
            else:
                logger.info(f"Boot detection complete: All {nvml_count} GPU(s) using NVML")

        except Exception as e:
            logger.error(f"Failed to detect GPUs: {e}")

    async def get_gpu_data(self):
        """Async collect metrics from all detected GPUs.

        Returns:
            dict mapping gpu_id (str) to its metrics dict; {} on failure.
        """
        if not self.initialized:
            logger.error("Cannot get GPU data - NVML not initialized")
            return {}

        try:
            device_count = pynvml.nvmlDeviceGetCount()
            gpu_data = {}

            # Get nvidia-smi data once if any GPU needs it
            smi_data = None
            if any(self.use_smi.values()):
                try:
                    # Run nvidia-smi in thread pool to avoid blocking.
                    # get_running_loop() replaces the deprecated
                    # get_event_loop() pattern inside coroutines.
                    smi_data = await asyncio.get_running_loop().run_in_executor(
                        None, parse_nvidia_smi
                    )
                except Exception as e:
                    logger.error(f"nvidia-smi failed: {e}")

            # Collect per-GPU NVML reads concurrently; smi-backed GPUs are
            # filled directly from the shared smi_data snapshot.
            tasks = []
            for i in range(device_count):
                gpu_id = str(i)
                if self.use_smi.get(gpu_id, False):
                    # Use nvidia-smi data
                    if smi_data and gpu_id in smi_data:
                        gpu_data[gpu_id] = smi_data[gpu_id]
                    else:
                        logger.warning(f"GPU {i}: No data from nvidia-smi")
                else:
                    # Use NVML - run in thread pool to avoid blocking
                    task = asyncio.get_running_loop().run_in_executor(
                        None, self._collect_single_gpu, i
                    )
                    tasks.append((gpu_id, task))

            # Wait for all NVML tasks to complete
            if tasks:
                results = await asyncio.gather(*[task for _, task in tasks], return_exceptions=True)
                for (gpu_id, _), result in zip(tasks, results):
                    if isinstance(result, Exception):
                        logger.error(f"GPU {gpu_id}: Error - {result}")
                    else:
                        gpu_data[gpu_id] = result

            if not gpu_data:
                logger.error("No GPU data collected from any source")

            # Cache the snapshot so _get_processes_sync can annotate it
            self.gpu_data = gpu_data
            return gpu_data

        except Exception as e:
            logger.error(f"Failed to get GPU data: {e}")
            return {}

    def _collect_single_gpu(self, gpu_index):
        """Collect data for a single GPU (runs in thread pool)."""
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
            return self.collector.collect_all(handle, str(gpu_index))
        except Exception as e:
            logger.error(f"GPU {gpu_index}: Error - {e}")
            return {}

    async def get_processes(self):
        """Async get GPU process information; [] when NVML is unavailable."""
        if not self.initialized:
            return []

        try:
            # Run process collection in thread pool
            return await asyncio.get_running_loop().run_in_executor(
                None, self._get_processes_sync
            )
        except Exception as e:
            logger.error(f"Error getting processes: {e}")
            return []

    def _get_processes_sync(self):
        """Synchronous process collection (runs in thread pool).

        Also writes per-GPU process counts back into ``self.gpu_data``.
        """
        try:
            device_count = pynvml.nvmlDeviceGetCount()
            all_processes = []
            gpu_process_counts = {}

            for i in range(device_count):
                try:
                    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                    uuid = pynvml.nvmlDeviceGetUUID(handle)
                    if isinstance(uuid, bytes):
                        uuid = uuid.decode('utf-8')

                    gpu_id = str(i)
                    # NOTE(review): only compute processes are queried below,
                    # so the 'graphics' count always stays 0 — confirm intent.
                    gpu_process_counts[gpu_id] = {'compute': 0, 'graphics': 0}

                    try:
                        procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                        gpu_process_counts[gpu_id]['compute'] = len(procs)

                        for proc in procs:
                            all_processes.append({
                                'pid': str(proc.pid),
                                'name': self._get_process_name(proc.pid),
                                'gpu_uuid': uuid,
                                'gpu_id': gpu_id,
                                'memory': float(proc.usedGpuMemory / (1024 ** 2))  # bytes -> MiB
                            })
                    except pynvml.NVMLError:
                        # Per-GPU process query unsupported; leave counts at 0
                        pass

                except pynvml.NVMLError:
                    continue

            for gpu_id, counts in gpu_process_counts.items():
                if gpu_id in self.gpu_data:
                    self.gpu_data[gpu_id]['compute_processes_count'] = counts['compute']
                    self.gpu_data[gpu_id]['graphics_processes_count'] = counts['graphics']

            return all_processes

        except Exception as e:
            logger.error(f"Error getting processes: {e}")
            return []

    def _get_process_name(self, pid):
        """Extract a readable process name from a PID.

        Prefers a non-generic executable name, then scans the command line
        for a meaningful script/binary name, and finally falls back to
        ``PID:<pid>`` when nothing useful is accessible.
        """
        try:
            p = psutil.Process(pid)

            # First try to get the process name
            try:
                process_name = p.name()
                if process_name and process_name not in ['python', 'python3', 'sh', 'bash']:
                    return process_name
            except (psutil.AccessDenied, psutil.NoSuchProcess, psutil.ZombieProcess):
                pass

            # Try to get command line for better name extraction
            try:
                cmdline = p.cmdline()
                if cmdline:
                    # Look for the actual executable or script name
                    for i, arg in enumerate(cmdline):
                        if not arg or arg.startswith('-'):
                            continue

                        # Skip common interpreters and shells
                        if arg in ['python', 'python3', 'node', 'java', 'sh', 'bash', 'zsh']:
                            continue

                        # Extract filename from path (handles / and \ separators)
                        filename = arg.split('/')[-1].split('\\')[-1]

                        # Skip if it's still a generic name
                        if filename in ['python', 'python3', 'node', 'java', 'sh', 'bash']:
                            continue

                        # Found a meaningful name
                        if filename:
                            return filename

                    # Fallback to first argument if nothing else worked
                    if cmdline[0]:
                        return cmdline[0].split('/')[-1].split('\\')[-1]

            except (psutil.AccessDenied, psutil.NoSuchProcess, psutil.ZombieProcess):
                pass

            # Final fallback
            return f'PID:{pid}'

        except (psutil.NoSuchProcess, psutil.ZombieProcess):
            return f'PID:{pid}'
        except Exception as e:
            logger.debug(f"Error getting process name for PID {pid}: {e}")
            return f'PID:{pid}'

    async def shutdown(self):
        """Release NVML; safe to call when initialization failed."""
        if self.initialized:
            try:
                pynvml.nvmlShutdown()
                self.initialized = False
                logger.info("NVML shutdown")
            except Exception as e:
                logger.error(f"Error shutting down NVML: {e}")
270 |
271 |
--------------------------------------------------------------------------------
/docs/404.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | 404 - Page Not Found | GPU Hot
7 |
8 |
9 |
10 |
348 |
349 |
350 |
351 |
352 |
357 |
358 |
359 |
360 |
404
361 |
Page Not Found
362 |
This page doesn't exist. Even the GPU couldn't compute this one.
363 |
Back to Home
364 |
368 |
369 |
370 |
371 |
372 |
379 |
380 |
381 |
382 |
--------------------------------------------------------------------------------
/tests/test_cluster.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Mock GPU cluster for load testing hub mode
4 | Simulates realistic GPU workloads across multiple servers
5 | """
6 |
7 | import time
8 | import random
9 | import asyncio
10 | import json
11 | from datetime import datetime
12 | import argparse
13 | import logging
14 | from fastapi import FastAPI, WebSocket
15 | import uvicorn
16 |
17 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(message)s')
18 | logger = logging.getLogger(__name__)
19 |
20 |
class MockGPUNode:
    """Simulates a GPU node with realistic metrics for load testing.

    Serves synthetic GPU/process/system payloads over a WebSocket at
    /socket.io/, broadcasting fresh data every 0.5s to all connected clients.
    """

    def __init__(self, node_name, gpu_count, port=1312):
        self.node_name = node_name
        self.gpu_count = gpu_count
        self.port = port
        self.app = FastAPI(title=f"Mock GPU Node {node_name}")
        self.websocket_connections = set()
        self.broadcasting = False

        # Initialize per-GPU state for realistic patterns
        self.gpu_states = []
        for gpu_id in range(gpu_count):
            self.gpu_states.append({
                'base_temp': random.randint(45, 55),
                'is_busy': random.random() < 0.4,  # 40% of GPUs are busy
                'job_start': time.time() - random.uniform(0, 300),  # Random job start time
                'memory': random.choice([12288, 24576]),  # Mix of 3080 (12GB) and 3090 (24GB)
                'allocated_memory': 0,
                'clock_base': random.randint(1710, 1890),  # Stable boost clock
            })

        self.start_time = time.time()

    def _generate_realistic_utilization(self, state, timestamp):
        """Generate realistic ML training utilization patterns.

        Mutates `state` in place (is_busy / job_start / allocated_memory) to
        simulate jobs starting and finishing; returns a utilization percent.
        """
        if not state['is_busy']:
            # Idle GPU - occasionally switch to busy
            if random.random() < 0.001:  # 0.1% chance per update to start job
                state['is_busy'] = True
                state['job_start'] = timestamp
                state['allocated_memory'] = state['memory'] * random.uniform(0.85, 0.95)
            return random.uniform(0, 3)

        # Busy GPU - simulate training epoch pattern
        job_duration = timestamp - state['job_start']
        epoch_time = 120  # 2 minute epochs
        epoch_progress = (job_duration % epoch_time) / epoch_time

        # Occasionally finish job
        if random.random() < 0.0005:  # Job finishes
            state['is_busy'] = False
            state['allocated_memory'] = 0
            return 0

        # Training pattern with data loading dips
        if epoch_progress < 0.05:  # Warmup phase
            return random.gauss(25, 5)
        elif epoch_progress > 0.93:  # Validation phase
            return random.gauss(65, 5)
        else:  # Main training
            base_util = random.gauss(96, 2)
            # Data loading dips every ~5 seconds
            if (timestamp % 5) < 0.4:
                base_util *= 0.75
            return max(0, min(100, base_util))

    def generate_gpu_data(self):
        """Generate one full payload: per-GPU metrics, processes, system stats."""
        timestamp = time.time()
        gpus = {}
        processes = []

        for gpu_id in range(self.gpu_count):
            state = self.gpu_states[gpu_id]

            # Realistic utilization pattern
            util = self._generate_realistic_utilization(state, timestamp)

            # Memory: allocated at job start, stays constant during training
            if state['is_busy']:
                mem_used = state['allocated_memory']
            else:
                mem_used = random.uniform(0, 100)  # Minimal idle usage

            # Temperature: correlates with utilization, slow changes
            target_temp = state['base_temp'] + (util / 100) * 35
            temp_variation = random.gauss(0, 1)
            temp = max(30, min(92, target_temp + temp_variation))

            # Power: correlates with utilization
            mem_base = state['memory']
            max_power = 250 if mem_base == 12288 else 350
            power = (util / 100) * max_power * random.uniform(0.85, 1.0)

            # Clock speeds: stable based on load
            if util > 50:
                clock_graphics = state['clock_base'] + random.randint(-20, 20)
                pstate = 'P0'
            elif util > 10:
                clock_graphics = int(state['clock_base'] * 0.8) + random.randint(-15, 15)
                pstate = 'P2'
            else:
                clock_graphics = random.randint(210, 500)
                pstate = 'P8'

            gpus[str(gpu_id)] = {
                'index': gpu_id,
                'name': f'NVIDIA RTX {"3090" if mem_base == 24576 else "3080"}',
                'utilization': round(util, 1),
                'temperature': round(temp, 1),
                'memory_used': round(mem_used, 0),
                'memory_total': mem_base,
                'power_draw': round(power, 1),
                'power_limit': max_power,
                'fan_speed': round(min(100, 30 + max(0, temp - 40) * 1.5)),
                'clock_graphics': clock_graphics,
                'clock_sm': clock_graphics,
                'clock_memory': 9501 if mem_base == 24576 else 9001,
                'pcie_gen': 4,
                'pcie_width': 16,
                'pstate': pstate,
                'encoder_sessions': 0,
                'decoder_sessions': 0,
                'throttle_reasons': []
            }

            # Add processes for busy GPUs
            if state['is_busy']:
                process_count = random.randint(1, 2)
                for p in range(process_count):
                    processes.append({
                        'pid': random.randint(1000, 99999),
                        'name': random.choice(['python3', 'train.py', 'pytorch', 'python']),
                        'gpu_memory': round(mem_used / process_count, 0),
                        'gpu_id': gpu_id
                    })

        # System metrics: correlate with GPU load.
        # Guard against division by zero when the node was created with 0 GPUs.
        avg_gpu_util = (sum(g['utilization'] for g in gpus.values()) / len(gpus)) if gpus else 0.0
        system = {
            'cpu_percent': round(random.gauss(15 + avg_gpu_util * 0.3, 5), 1),
            'memory_percent': round(random.gauss(60, 10), 1),
            'memory_used': round(random.gauss(80, 15), 1),
            'memory_total': 128.0
        }

        return {
            'node_name': self.node_name,
            'gpus': gpus,
            'processes': processes,
            'system': system
        }

    async def _broadcast_loop(self):
        """Background task to broadcast GPU data every 0.5s"""
        while self.broadcasting:
            try:
                data = self.generate_gpu_data()

                # Send to all connected clients
                if self.websocket_connections:
                    payload = json.dumps(data)  # serialize once, not per client
                    disconnected = set()
                    # Iterate a snapshot: the endpoint coroutine may add/remove
                    # connections while we await sends, and mutating the set
                    # during iteration would raise RuntimeError.
                    for websocket in list(self.websocket_connections):
                        try:
                            await websocket.send_text(payload)
                        except Exception:
                            # Narrowed from bare `except:` so CancelledError /
                            # KeyboardInterrupt are not swallowed here.
                            disconnected.add(websocket)

                    # Remove disconnected clients
                    self.websocket_connections -= disconnected

            except Exception as e:
                logger.error(f'[{self.node_name}] Error in broadcast loop: {e}')
            await asyncio.sleep(0.5)

    def setup_routes(self):
        """Setup WebSocket routes"""

        @self.app.websocket("/socket.io/")
        async def websocket_endpoint(websocket: WebSocket):
            await websocket.accept()
            self.websocket_connections.add(websocket)
            logger.info(f'[{self.node_name}] Client connected')

            # Start broadcasting when first client connects
            if not self.broadcasting:
                self.broadcasting = True
                asyncio.create_task(self._broadcast_loop())

            try:
                # Keep connection alive; we ignore incoming messages.
                while True:
                    await websocket.receive_text()
            except Exception as e:
                logger.debug(f'[{self.node_name}] Client disconnected: {e}')
            finally:
                self.websocket_connections.discard(websocket)

    async def run(self):
        """Run the mock node server until cancelled."""
        self.setup_routes()

        logger.info(f'[{self.node_name}] Starting mock node with {self.gpu_count} GPUs on port {self.port}')

        # Create server config
        config = uvicorn.Config(
            self.app,
            host='0.0.0.0',
            port=self.port,
            log_level='info',
            access_log=False
        )
        server = uvicorn.Server(config)
        await server.serve()
227 |
228 |
async def start_mock_node(node_name, gpu_count, port):
    """Spin up one MockGPUNode server and block until it stops."""
    await MockGPUNode(node_name, gpu_count, port).run()
233 |
234 |
async def main():
    """Parse CLI flags, print hub setup instructions, and run all mock nodes.

    Flags:
        --nodes:     comma-separated GPU counts, one entry per mock server
        --base-port: first port; each subsequent node uses base + index
        --prefix:    node name prefix ("<prefix>-<index+1>")
    """
    parser = argparse.ArgumentParser(description='Mock GPU cluster for testing')
    parser.add_argument('--nodes', type=str, default='2,4,8',
                        help='Comma-separated GPU counts for each node (e.g., "2,4,8")')
    parser.add_argument('--base-port', type=int, default=13120,
                        help='Base port for nodes (increments for each node)')
    parser.add_argument('--prefix', type=str, default='gpu-server',
                        help='Prefix for node names')

    args = parser.parse_args()

    gpu_counts = [int(x.strip()) for x in args.nodes.split(',')]

    print("\n" + "="*60)
    print("GPU Hot - Mock Cluster Test (FastAPI + AsyncIO)")
    print("="*60)
    print(f"\nStarting {len(gpu_counts)} mock GPU servers:\n")

    node_urls = []
    for i, gpu_count in enumerate(gpu_counts):
        port = args.base_port + i
        node_name = f"{args.prefix}-{i+1}"
        node_urls.append(f"http://localhost:{port}")
        print(f"  • {node_name}: {gpu_count} GPUs on port {port}")

    print("\n" + "-"*60)
    print("Mock nodes running! Now start the hub with:")
    print("-"*60)
    print(f"\nexport GPU_HOT_MODE=hub")
    print(f"export NODE_URLS={','.join(node_urls)}")
    print(f"python app.py")
    print("\nOr with Docker:")
    print(f"\ndocker run -d -p 1312:1312 \\")
    print(f"  -e GPU_HOT_MODE=hub \\")
    print(f"  -e NODE_URLS={','.join(node_urls)} \\")
    print(f"  --network=host \\")
    print(f"  ghcr.io/psalias2006/gpu-hot:latest")
    print("\nThen open: http://localhost:1312")
    print("-"*60 + "\n")

    # Start all nodes concurrently
    tasks = []
    for i, gpu_count in enumerate(gpu_counts):
        port = args.base_port + i
        node_name = f"{args.prefix}-{i+1}"
        task = asyncio.create_task(start_mock_node(node_name, gpu_count, port))
        tasks.append(task)

    # Keep all tasks running. On Ctrl+C, asyncio.run() cancels this coroutine,
    # which surfaces here as CancelledError rather than KeyboardInterrupt —
    # catch both so the shutdown message is actually printed.
    try:
        await asyncio.gather(*tasks)
    except (KeyboardInterrupt, asyncio.CancelledError):
        print("\n\nStopping mock cluster...")
288 |
289 |
# Script entry point: run the mock cluster under asyncio's default event loop.
if __name__ == '__main__':
    asyncio.run(main())
292 |
293 |
--------------------------------------------------------------------------------
/static/js/chart-config.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Chart configuration factory - DRY approach for chart configs
3 | */
4 |
5 | // Base chart options shared across all charts
/**
 * Base Chart.js options shared by every chart on the dashboard.
 * Returns a fresh object on each call so per-chart customization
 * (axis max, tooltip callbacks, legend) never leaks between charts.
 */
function getBaseChartOptions() {
    const xAxis = {
        display: true,
        offset: true,
        grid: {
            display: false,
            drawBorder: false,
            offset: true
        },
        ticks: {
            color: 'rgba(255, 255, 255, 0.6)',
            font: { size: 11, weight: '500' },
            maxRotation: 0,
            autoSkip: true,
            maxTicksLimit: 7,
            padding: 8,
            align: 'center'
        }
    };

    const yAxis = {
        min: 0,
        grid: {
            color: 'rgba(255, 255, 255, 0.08)',
            borderDash: [2, 3],
            drawBorder: false,
            lineWidth: 1
        },
        ticks: {
            color: 'rgba(255, 255, 255, 0.7)',
            font: { size: 12, weight: '500' },
            padding: 12,
            count: 6
        }
    };

    const tooltip = {
        backgroundColor: 'rgba(0, 0, 0, 0.9)',
        titleColor: '#ffffff',
        bodyColor: '#ffffff',
        borderWidth: 2,
        cornerRadius: 12,
        displayColors: true,
        padding: 12,
        titleFont: { size: 14, weight: 'bold' },
        bodyFont: { size: 13 }
    };

    return {
        responsive: true,
        maintainAspectRatio: false,
        animation: false, // disable all animations globally
        interaction: {
            intersect: false,
            mode: 'index'
        },
        layout: {
            padding: { left: 0, right: 0, top: 5, bottom: 10 }
        },
        scales: { x: xAxis, y: yAxis },
        plugins: {
            legend: { display: false },
            tooltip: tooltip
        }
    };
}
71 |
72 | // Create a line chart configuration
/**
 * Build a single-series line chart config on top of the shared base options.
 * Optional `thresholds` add static dashed horizontal guide series; their
 * tooltip rows show only the raw label, never a formatted value.
 */
function createLineChartConfig(options) {
    const {
        label,
        borderColor,
        backgroundColor,
        yMax,
        yStepSize,
        yUnit,
        tooltipTitle,
        tooltipLabel, // Optional: custom label for tooltip (defaults to dataset label)
        tooltipAfterLabel,
        thresholds = []
    } = options;

    // Primary data series.
    const primary = {
        label: label,
        data: [],
        borderColor: borderColor,
        backgroundColor: backgroundColor,
        borderWidth: 2.5,
        tension: 0.35,
        fill: true,
        pointRadius: 0,
        pointHitRadius: 12,
        pointBackgroundColor: borderColor,
        pointBorderColor: '#fff',
        pointBorderWidth: 2,
        borderCapStyle: 'round',
        borderJoinStyle: 'round'
    };

    // Dashed threshold guide lines.
    const guideSeries = thresholds.map(t => ({
        label: t.label,
        data: [],
        borderColor: t.color,
        backgroundColor: 'transparent',
        borderWidth: 1,
        borderDash: t.dash || [5, 5],
        pointRadius: 0,
        fill: false
    }));

    const chartOptions = getBaseChartOptions();

    // Y-axis tuning: a hard max wins over a suggested max.
    if (yMax !== undefined) {
        chartOptions.scales.y.max = yMax;
    } else if (options.ySuggestedMax) {
        chartOptions.scales.y.suggestedMax = options.ySuggestedMax;
    }
    if (yStepSize) chartOptions.scales.y.ticks.stepSize = yStepSize;
    if (yUnit) {
        chartOptions.scales.y.ticks.callback = value => value + yUnit;
    }

    // A dataset is a threshold row if its label starts with a threshold's
    // label text (compared up to the first '(').
    const isThresholdLabel = dsLabel =>
        thresholds.some(t => dsLabel.includes(t.label.split('(')[0]));

    chartOptions.plugins.tooltip.borderColor = borderColor;
    chartOptions.plugins.tooltip.callbacks = {
        title: () => tooltipTitle,
        label: context => {
            const dsLabel = context.dataset.label || '';
            if (isThresholdLabel(dsLabel)) {
                return dsLabel;
            }
            const shown = tooltipLabel || dsLabel;
            return `${shown}: ${context.parsed.y.toFixed(options.decimals || 1)}${yUnit || ''}`;
        },
        afterLabel: tooltipAfterLabel
            ? context => {
                if (isThresholdLabel(context.dataset.label)) {
                    return null;
                }
                return tooltipAfterLabel(context.parsed.y);
            }
            : undefined
    };

    return {
        type: 'line',
        data: {
            labels: [],
            datasets: [primary, ...guideSeries]
        },
        options: chartOptions
    };
}
163 |
164 | // Create multi-line chart (for clocks, pcie, etc)
/**
 * Build a multi-series line chart config (clocks, PCIe throughput, ...).
 * Each entry in `datasets` describes one series: { label, color, bgColor?,
 * width?, fill? }. The tooltip border takes the first series' color.
 */
function createMultiLineChartConfig(options) {
    const {
        datasets,
        yUnit,
        tooltipTitle,
        showLegend = false,
        ySuggestedMax,
        decimals = 0
    } = options;

    // One Chart.js dataset per series descriptor.
    const series = datasets.map(spec => ({
        label: spec.label,
        data: [],
        borderColor: spec.color,
        backgroundColor: spec.bgColor || `${spec.color}15`,
        borderWidth: spec.width || 2.5,
        tension: 0.35,
        fill: spec.fill !== undefined ? spec.fill : false,
        pointRadius: 0,
        pointHitRadius: 12,
        pointBackgroundColor: spec.color,
        pointBorderColor: '#fff',
        pointBorderWidth: 2,
        borderCapStyle: 'round',
        borderJoinStyle: 'round'
    }));

    const chartOptions = getBaseChartOptions();

    // Y axis customization
    if (ySuggestedMax) chartOptions.scales.y.suggestedMax = ySuggestedMax;
    if (yUnit) {
        chartOptions.scales.y.ticks.callback = value => value.toFixed(decimals) + yUnit;
    }

    // Legend (off in the base options; enabled per-chart)
    if (showLegend) {
        const legend = chartOptions.plugins.legend;
        legend.display = true;
        legend.position = 'top';
        legend.align = 'end';
        legend.labels = {
            color: 'rgba(255, 255, 255, 0.8)',
            font: { size: 11 },
            boxWidth: 10,
            boxHeight: 10,
            padding: 10,
            usePointStyle: true
        };
    }

    // Tooltip
    chartOptions.plugins.tooltip.borderColor = datasets[0].color;
    chartOptions.plugins.tooltip.callbacks = {
        title: () => tooltipTitle,
        label: context => {
            const name = context.dataset.label || '';
            return `${name}: ${context.parsed.y.toFixed(decimals)}${yUnit || ''}`;
        }
    };

    return {
        type: 'line',
        data: {
            labels: [],
            datasets: series
        },
        options: chartOptions
    };
}
237 |
238 | // Chart configurations using factory functions
// Chart configurations using factory functions.
// NOTE: createMultiLineChartConfig reads per-series fill color from `bgColor`
// (not `backgroundColor`) — the pcie/appclocks entries below previously used
// `backgroundColor`, which the factory silently ignored, so their intended
// fills never applied. Fixed by renaming those keys to `bgColor`.
const chartConfigs = {
    utilization: createLineChartConfig({
        label: 'GPU Utilization',
        borderColor: '#4facfe',
        backgroundColor: 'rgba(79, 172, 254, 0.15)',
        yMax: 100,
        yStepSize: 20,
        yUnit: '%',
        tooltipTitle: 'GPU Utilization',
        thresholds: [
            { label: 'High Load (80%)', color: 'rgba(250, 112, 154, 0.5)', dash: [5, 5] }
        ],
        tooltipAfterLabel: (value) => {
            if (value > 90) return '🔥 Very High';
            if (value > 80) return '⚡ High';
            if (value > 50) return '✓ Active';
            return '💤 Low';
        }
    }),

    temperature: createLineChartConfig({
        label: 'GPU Temperature',
        borderColor: '#f5576c',
        backgroundColor: 'rgba(245, 87, 108, 0.15)',
        ySuggestedMax: 90,
        yStepSize: 15,
        yUnit: '°C',
        tooltipTitle: 'GPU Temperature',
        thresholds: [
            { label: 'Warning (75°C)', color: 'rgba(254, 202, 87, 0.6)', dash: [5, 5] },
            { label: 'Danger (85°C)', color: 'rgba(250, 112, 154, 0.8)', dash: [10, 5] }
        ],
        tooltipAfterLabel: (value) => {
            if (value > 85) return '🚨 DANGER';
            if (value > 75) return '⚠️ Warning';
            if (value > 60) return '🌡️ Normal';
            return '❄️ Cool';
        }
    }),

    memory: createLineChartConfig({
        label: 'Memory Usage',
        borderColor: '#4facfe',
        backgroundColor: 'rgba(79, 172, 254, 0.15)',
        yMax: 100,
        yStepSize: 20,
        yUnit: '%',
        tooltipTitle: 'VRAM Usage',
        thresholds: [
            { label: 'High Usage (90%)', color: 'rgba(250, 112, 154, 0.6)', dash: [5, 5] }
        ],
        tooltipAfterLabel: (value) => {
            if (value > 95) return '🚨 Critical';
            if (value > 90) return '⚠️ Very High';
            if (value > 75) return '📊 High';
            return '✓ Normal';
        }
    }),

    power: createLineChartConfig({
        label: 'Power Draw',
        borderColor: '#43e97b',
        backgroundColor: 'rgba(67, 233, 123, 0.15)',
        ySuggestedMax: 200,
        yStepSize: 50,
        yUnit: ' W',
        tooltipTitle: 'Power Draw',
        tooltipLabel: 'Power', // Shortened label for tooltip
        tooltipAfterLabel: (value) => {
            if (value > 200) return '⚡ Maximum Performance';
            if (value > 150) return '🔥 High Performance';
            if (value > 100) return '💪 Active';
            if (value > 50) return '✓ Moderate';
            return '💤 Idle';
        }
    }),

    fanSpeed: createLineChartConfig({
        label: 'Fan Speed',
        borderColor: '#38bdf8',
        backgroundColor: 'rgba(56, 189, 248, 0.15)',
        yMax: 100,
        yStepSize: 20,
        yUnit: '%',
        tooltipTitle: 'Fan Speed',
        tooltipAfterLabel: (value) => {
            if (value > 90) return '🌪️ Maximum';
            if (value > 70) return '💨 High';
            if (value > 40) return '🌬️ Active';
            if (value > 10) return '✓ Low';
            return '⏸️ Idle';
        }
    }),

    clocks: createMultiLineChartConfig({
        datasets: [
            { label: 'Graphics Clock', color: '#a78bfa', bgColor: 'rgba(167, 139, 250, 0.1)' },
            { label: 'SM Clock', color: '#fb923c', bgColor: 'rgba(251, 146, 60, 0.1)' },
            { label: 'Memory Clock', color: '#34d399', bgColor: 'rgba(52, 211, 153, 0.1)' }
        ],
        yUnit: ' MHz',
        tooltipTitle: 'Clock Speeds',
        showLegend: true,
        decimals: 0
    }),

    efficiency: createLineChartConfig({
        label: 'Power Efficiency',
        borderColor: '#fbbf24',
        backgroundColor: 'rgba(251, 191, 36, 0.15)',
        yUnit: ' %/W',
        tooltipTitle: 'Power Efficiency',
        tooltipLabel: 'Efficiency', // Shortened label for tooltip
        decimals: 2,
        tooltipAfterLabel: (value) => {
            if (value > 0.8) return '⭐ Excellent';
            if (value > 0.5) return '✓ Good';
            if (value > 0.3) return '📊 Fair';
            if (value > 0.1) return '⚡ Active';
            return '💤 Idle';
        }
    }),

    pcie: createMultiLineChartConfig({
        datasets: [
            // Fixed: was `backgroundColor` (ignored by the factory)
            { label: 'RX Throughput', color: '#3b82f6', bgColor: 'rgba(59, 130, 246, 0.15)', width: 3, fill: true },
            { label: 'TX Throughput', color: '#8b5cf6', bgColor: 'rgba(139, 92, 246, 0.15)', width: 3, fill: true }
        ],
        yUnit: ' KB/s',
        tooltipTitle: 'PCIe Throughput',
        showLegend: true,
        decimals: 0
    }),

    appclocks: createMultiLineChartConfig({
        datasets: [
            // Fixed: was `backgroundColor` (ignored by the factory)
            { label: 'Graphics Clock', color: '#4facfe', bgColor: 'rgba(79, 172, 254, 0.15)', width: 2, fill: true },
            { label: 'Memory Clock', color: '#f59e0b', bgColor: 'rgba(245, 158, 11, 0.15)', width: 2, fill: true },
            { label: 'SM Clock', color: '#ec4899', bgColor: 'rgba(236, 72, 153, 0.15)', width: 2, fill: true },
            { label: 'Video Clock', color: '#10b981', bgColor: 'rgba(16, 185, 129, 0.15)', width: 2, fill: true }
        ],
        yUnit: ' MHz',
        tooltipTitle: 'Application Clocks',
        showLegend: true,
        decimals: 0
    })
};
386 |
387 |
--------------------------------------------------------------------------------
/docs/demo.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | GPU Hot - Interactive Demo
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
29 |
30 |
31 |
32 |
36 |
37 |
38 | ⚠️ Interactive Demo - Simulated Data
39 |
40 |
41 |
42 |
43 |
44 |
Live Monitoring (Demo)
45 |
46 |
Running Demo
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
59 |
60 |
61 |
62 |
77 |
78 |
79 |
80 |
81 |
0%
82 |
System CPU
83 |
Host Processor
84 |
85 |
86 |
87 |
0%
88 |
System RAM
89 |
Host Memory
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
344 |
345 |
346 |
--------------------------------------------------------------------------------
/core/metrics/collector.py:
--------------------------------------------------------------------------------
1 | """GPU metrics collector using NVML"""
2 |
3 | import time
4 | import pynvml
5 | from datetime import datetime
6 | from .utils import safe_get, decode_bytes, to_mib, to_watts
7 |
8 |
9 | class MetricsCollector:
10 | """Collect all available GPU metrics via NVML"""
11 |
    def __init__(self):
        # gpu_id -> last full metrics dict; used to compute change rates
        # (e.g. memory_change_rate) between consecutive samples.
        self.previous_samples = {}
        # gpu_id -> time.time() of the previous sample.
        self.last_sample_time = {}
15 |
16 | def collect_all(self, handle, gpu_id):
17 | """Collect all available metrics for a GPU"""
18 | data = {
19 | 'index': gpu_id,
20 | 'timestamp': datetime.now().isoformat()
21 | }
22 | current_time = time.time()
23 |
24 | self._add_basic_info(handle, data)
25 | self._add_performance(handle, data)
26 | self._add_memory(handle, data, gpu_id, current_time)
27 | self._add_power_thermal(handle, data)
28 | self._add_clocks(handle, data)
29 | self._add_connectivity(handle, data)
30 | self._add_media_engines(handle, data)
31 | self._add_health_status(handle, data)
32 | self._add_advanced(handle, data)
33 |
34 | self.previous_samples[gpu_id] = data.copy()
35 | self.last_sample_time[gpu_id] = current_time
36 |
37 | return data
38 |
39 | def _add_basic_info(self, handle, data):
40 | """Basic GPU information"""
41 | if name := safe_get(pynvml.nvmlDeviceGetName, handle):
42 | data['name'] = decode_bytes(name)
43 |
44 | if uuid := safe_get(pynvml.nvmlDeviceGetUUID, handle):
45 | data['uuid'] = decode_bytes(uuid)
46 |
47 | if driver := safe_get(pynvml.nvmlSystemGetDriverVersion):
48 | data['driver_version'] = decode_bytes(driver)
49 |
50 | if vbios := safe_get(pynvml.nvmlDeviceGetVbiosVersion, handle):
51 | data['vbios_version'] = decode_bytes(vbios)
52 |
53 | # Brand and architecture with smart detection
54 | self._detect_brand(handle, data)
55 | self._detect_architecture(handle, data)
56 |
57 | # CUDA capability
58 | if cap := safe_get(pynvml.nvmlDeviceGetCudaComputeCapability, handle):
59 | data['cuda_compute_capability'] = f"{cap[0]}.{cap[1]}"
60 |
61 | # Serial number
62 | if serial := safe_get(pynvml.nvmlDeviceGetSerial, handle):
63 | data['serial'] = decode_bytes(serial)
64 |
65 | def _detect_brand(self, handle, data):
66 | """Detect GPU brand from NVML"""
67 | BRAND_MAP = {
68 | 1: 'GeForce', 2: 'Quadro', 3: 'Tesla',
69 | 4: 'NVS', 5: 'GRID', 6: 'Titan',
70 | 7: 'GeForce GTX', 8: 'GeForce RTX', 9: 'Titan RTX'
71 | }
72 |
73 | if brand := safe_get(pynvml.nvmlDeviceGetBrand, handle):
74 | data['brand'] = BRAND_MAP.get(brand, f'Brand {brand}')
75 |
76 | def _detect_architecture(self, handle, data):
77 | """Detect GPU architecture with fallback to name-based detection"""
78 | ARCH_MAP = {
79 | 0: 'Kepler', 1: 'Maxwell', 2: 'Pascal', 3: 'Volta',
80 | 4: 'Turing', 5: 'Ampere', 6: 'Ada Lovelace', 7: 'Hopper',
81 | 8: 'Ada Lovelace', 9: 'Ada Lovelace' # Driver variations
82 | }
83 |
84 | # Try NVML first
85 | if arch := safe_get(pynvml.nvmlDeviceGetArchitecture, handle):
86 | data['architecture'] = ARCH_MAP.get(arch, self._detect_arch_from_name(data.get('name', '')))
87 | # Fallback to name-based detection
88 | elif 'name' in data:
89 | data['architecture'] = self._detect_arch_from_name(data['name'])
90 |
91 | def _detect_arch_from_name(self, gpu_name):
92 | """Detect architecture from GPU model name"""
93 | name = gpu_name.upper()
94 |
95 | arch_patterns = [
96 | (['RTX 40', 'RTX 4', 'L40', 'L4'], 'Ada Lovelace'),
97 | (['H100', 'H200'], 'Hopper'),
98 | (['RTX 30', 'RTX 3', 'A100', 'A40', 'A30', 'A10', 'A6000', 'A5000', 'A4000', 'A2000'], 'Ampere'),
99 | (['RTX 20', 'RTX 2', 'GTX 16', 'T1000', 'T2000', 'T600'], 'Turing'),
100 | (['GTX 10', 'TITAN X', 'P100', 'P40', 'P6'], 'Pascal'),
101 | (['GTX 9', 'TITAN M', 'M60', 'M40'], 'Maxwell'),
102 | (['GTX 7', 'GTX 6', 'K80', 'K40'], 'Kepler'),
103 | (['V100'], 'Volta'),
104 | ]
105 |
106 | for patterns, arch in arch_patterns:
107 | if any(pattern in name for pattern in patterns):
108 | return arch
109 |
110 | return 'Unknown'
111 |
112 | def _add_performance(self, handle, data):
113 | """Performance metrics"""
114 | # Utilization
115 | if util := safe_get(pynvml.nvmlDeviceGetUtilizationRates, handle):
116 | data['utilization'] = float(util.gpu)
117 | data['memory_utilization'] = float(util.memory)
118 |
119 | # Performance state
120 | if pstate := safe_get(pynvml.nvmlDeviceGetPerformanceState, handle):
121 | data['performance_state'] = f'P{pstate}'
122 |
123 | # Compute mode
124 | if mode := safe_get(pynvml.nvmlDeviceGetComputeMode, handle):
125 | modes = {0: 'Default', 1: 'Exclusive Thread',
126 | 2: 'Prohibited', 3: 'Exclusive Process'}
127 | data['compute_mode'] = modes.get(mode, 'Unknown')
128 |
129 | def _add_memory(self, handle, data, gpu_id, current_time):
130 | """Memory metrics"""
131 | if mem := safe_get(pynvml.nvmlDeviceGetMemoryInfo, handle):
132 | data['memory_used'] = to_mib(mem.used)
133 | data['memory_total'] = to_mib(mem.total)
134 | data['memory_free'] = to_mib(mem.free)
135 |
136 | # Calculate change rate
137 | if gpu_id in self.previous_samples:
138 | prev = self.previous_samples[gpu_id]
139 | if 'memory_used' in prev:
140 | dt = current_time - self.last_sample_time.get(gpu_id, current_time)
141 | if dt > 0:
142 | delta = data['memory_used'] - prev['memory_used']
143 | data['memory_change_rate'] = float(delta / dt)
144 |
145 | # BAR1 memory
146 | if bar1 := safe_get(pynvml.nvmlDeviceGetBAR1MemoryInfo, handle):
147 | data['bar1_memory_used'] = to_mib(bar1.bar1Used)
148 | data['bar1_memory_total'] = to_mib(bar1.bar1Total)
149 |
150 | def _add_power_thermal(self, handle, data):
151 | """Power and thermal metrics"""
152 | self._add_temperature(handle, data)
153 | self._add_power(handle, data)
154 | self._add_fan_speeds(handle, data)
155 | self._add_throttling(handle, data)
156 |
157 | def _add_temperature(self, handle, data):
158 | if temp := safe_get(pynvml.nvmlDeviceGetTemperature, handle, pynvml.NVML_TEMPERATURE_GPU):
159 | data['temperature'] = float(temp)
160 |
161 | if temp_mem := safe_get(pynvml.nvmlDeviceGetTemperature, handle, 1):
162 | if temp_mem > 0:
163 | data['temperature_memory'] = float(temp_mem)
164 |
165 | def _add_power(self, handle, data):
166 | if power := safe_get(pynvml.nvmlDeviceGetPowerUsage, handle):
167 | data['power_draw'] = to_watts(power)
168 |
169 | if limit := safe_get(pynvml.nvmlDeviceGetPowerManagementLimit, handle):
170 | data['power_limit'] = to_watts(limit)
171 |
172 | if constraints := safe_get(pynvml.nvmlDeviceGetPowerManagementLimitConstraints, handle):
173 | if isinstance(constraints, tuple) and len(constraints) >= 2:
174 | data['power_limit_min'] = to_watts(constraints[0])
175 | data['power_limit_max'] = to_watts(constraints[1])
176 |
177 | if energy := safe_get(pynvml.nvmlDeviceGetTotalEnergyConsumption, handle):
178 | data['energy_consumption'] = float(energy) / 1000.0
179 | data['energy_consumption_wh'] = float(energy) / 3600000.0
180 |
181 | def _add_fan_speeds(self, handle, data):
182 | if fan := safe_get(pynvml.nvmlDeviceGetFanSpeed, handle):
183 | data['fan_speed'] = float(fan)
184 |
185 | if hasattr(pynvml, 'nvmlDeviceGetNumFans') and hasattr(pynvml, 'nvmlDeviceGetFanSpeed_v2'):
186 | if num_fans := safe_get(pynvml.nvmlDeviceGetNumFans, handle):
187 | fans = []
188 | for i in range(num_fans):
189 | if speed := safe_get(pynvml.nvmlDeviceGetFanSpeed_v2, handle, i):
190 | fans.append(float(speed))
191 | if fans:
192 | data['fan_speeds'] = fans
193 |
194 | def _add_throttling(self, handle, data):
195 | if throttle := safe_get(pynvml.nvmlDeviceGetCurrentClocksThrottleReasons, handle):
196 | throttle_map = [
197 | (pynvml.nvmlClocksThrottleReasonGpuIdle, 'GPU Idle'),
198 | (pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting, 'App Settings'),
199 | (pynvml.nvmlClocksThrottleReasonSwPowerCap, 'SW Power Cap'),
200 | (pynvml.nvmlClocksThrottleReasonHwSlowdown, 'HW Slowdown'),
201 | (pynvml.nvmlClocksThrottleReasonSwThermalSlowdown, 'SW Thermal'),
202 | (pynvml.nvmlClocksThrottleReasonHwThermalSlowdown, 'HW Thermal'),
203 | (pynvml.nvmlClocksThrottleReasonHwPowerBrakeSlowdown, 'Power Brake'),
204 | ]
205 | reasons = [label for flag, label in throttle_map if throttle & flag]
206 | data['throttle_reasons'] = ', '.join(reasons) if reasons else 'None'
207 |
208 | def _add_clocks(self, handle, data):
209 | """Clock speed metrics"""
210 | clock_types = [
211 | ('clock_graphics', pynvml.NVML_CLOCK_GRAPHICS),
212 | ('clock_sm', pynvml.NVML_CLOCK_SM),
213 | ('clock_memory', pynvml.NVML_CLOCK_MEM),
214 | ('clock_video', pynvml.NVML_CLOCK_VIDEO),
215 | ]
216 |
217 | for key, clock_type in clock_types:
218 | # Current clocks
219 | if clock := safe_get(pynvml.nvmlDeviceGetClockInfo, handle, clock_type):
220 | data[key] = float(clock)
221 |
222 | # Max clocks
223 | if max_clock := safe_get(pynvml.nvmlDeviceGetMaxClockInfo, handle, clock_type):
224 | data[f'{key}_max'] = float(max_clock)
225 |
226 | # Application clocks (target clocks set by user/driver)
227 | if app_clock := safe_get(pynvml.nvmlDeviceGetApplicationsClock, handle, clock_type):
228 | data[f'{key}_app'] = float(app_clock)
229 |
230 | # Default application clocks
231 | if default_clock := safe_get(pynvml.nvmlDeviceGetDefaultApplicationsClock, handle, clock_type):
232 | data[f'{key}_default'] = float(default_clock)
233 |
234 | # Supported memory clocks (list of all available clock speeds)
235 | try:
236 | if mem_clocks := safe_get(pynvml.nvmlDeviceGetSupportedMemoryClocks, handle):
237 | if mem_clocks and len(mem_clocks) > 0:
238 | data['supported_memory_clocks'] = [float(c) for c in mem_clocks[:10]] # Limit to first 10
239 | except:
240 | pass
241 |
242 | def _add_connectivity(self, handle, data):
243 | """PCIe and interconnect metrics"""
244 | # PCIe
245 | pcie_metrics = [
246 | ('pcie_gen', pynvml.nvmlDeviceGetCurrPcieLinkGeneration),
247 | ('pcie_gen_max', pynvml.nvmlDeviceGetMaxPcieLinkGeneration),
248 | ('pcie_width', pynvml.nvmlDeviceGetCurrPcieLinkWidth),
249 | ('pcie_width_max', pynvml.nvmlDeviceGetMaxPcieLinkWidth),
250 | ]
251 |
252 | for key, func in pcie_metrics:
253 | if value := safe_get(func, handle):
254 | data[key] = str(value)
255 |
256 | # PCIe throughput
257 | if tx := safe_get(pynvml.nvmlDeviceGetPcieThroughput, handle,
258 | pynvml.NVML_PCIE_UTIL_TX_BYTES):
259 | data['pcie_tx_throughput'] = float(tx)
260 |
261 | if rx := safe_get(pynvml.nvmlDeviceGetPcieThroughput, handle,
262 | pynvml.NVML_PCIE_UTIL_RX_BYTES):
263 | data['pcie_rx_throughput'] = float(rx)
264 |
265 | # PCI info
266 | if pci := safe_get(pynvml.nvmlDeviceGetPciInfo, handle):
267 | data['pci_bus_id'] = decode_bytes(pci.busId)
268 |
269 | def _add_media_engines(self, handle, data):
270 | """Encoder/decoder metrics"""
271 | # Encoder
272 | if enc := safe_get(pynvml.nvmlDeviceGetEncoderUtilization, handle):
273 | if isinstance(enc, tuple) and len(enc) >= 2:
274 | data['encoder_utilization'] = float(enc[0])
275 |
276 | try:
277 | if sessions := pynvml.nvmlDeviceGetEncoderSessions(handle):
278 | data['encoder_sessions'] = len(sessions)
279 | if fps := [s.averageFps for s in sessions if hasattr(s, 'averageFps')]:
280 | data['encoder_fps'] = float(sum(fps) / len(fps))
281 | except:
282 | pass
283 |
284 | # Decoder
285 | if dec := safe_get(pynvml.nvmlDeviceGetDecoderUtilization, handle):
286 | if isinstance(dec, tuple) and len(dec) >= 2:
287 | data['decoder_utilization'] = float(dec[0])
288 |
289 | try:
290 | if sessions := pynvml.nvmlDeviceGetDecoderSessions(handle):
291 | data['decoder_sessions'] = len(sessions)
292 | except:
293 | pass
294 |
295 | def _add_health_status(self, handle, data):
296 | """ECC and health metrics"""
297 | try:
298 | if ecc := pynvml.nvmlDeviceGetEccMode(handle):
299 | if ecc[0]:
300 | data['ecc_enabled'] = True
301 |
302 | # ECC errors
303 | if err := safe_get(pynvml.nvmlDeviceGetTotalEccErrors, handle,
304 | pynvml.NVML_MEMORY_ERROR_TYPE_CORRECTED,
305 | pynvml.NVML_VOLATILE_ECC):
306 | data['ecc_errors_corrected'] = int(err)
307 | except:
308 | pass
309 |
310 | # Retired pages
311 | try:
312 | if pages := pynvml.nvmlDeviceGetRetiredPages(handle,
313 | pynvml.NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR):
314 | data['retired_pages'] = len(pages)
315 | except:
316 | pass
317 |
318 | def _add_advanced(self, handle, data):
319 | """Advanced features"""
320 | if mode := safe_get(pynvml.nvmlDeviceGetPersistenceMode, handle):
321 | data['persistence_mode'] = 'Enabled' if mode else 'Disabled'
322 |
323 | if display := safe_get(pynvml.nvmlDeviceGetDisplayActive, handle):
324 | data['display_active'] = bool(display)
325 |
326 | if multi := safe_get(pynvml.nvmlDeviceGetMultiGpuBoard, handle):
327 | data['multi_gpu_board'] = bool(multi)
328 |
329 | if procs := safe_get(pynvml.nvmlDeviceGetGraphicsRunningProcesses, handle, default=[]):
330 | data['graphics_processes_count'] = len(procs)
331 |
332 | self._add_mig_mode(handle, data)
333 | self._add_nvlink(handle, data)
334 |
335 | def _add_mig_mode(self, handle, data):
336 | if hasattr(pynvml, 'nvmlDeviceGetMigMode'):
337 | if mig := safe_get(pynvml.nvmlDeviceGetMigMode, handle):
338 | if isinstance(mig, tuple) and len(mig) >= 2:
339 | data['mig_mode_current'] = 'Enabled' if mig[0] else 'Disabled'
340 | data['mig_mode_pending'] = 'Enabled' if mig[1] else 'Disabled'
341 |
342 | def _add_nvlink(self, handle, data):
343 | if hasattr(pynvml, 'nvmlDeviceGetNvLinkState'):
344 | nvlinks = []
345 | active_count = 0
346 |
347 | for link_id in range(6):
348 | if state := safe_get(pynvml.nvmlDeviceGetNvLinkState, handle, link_id):
349 | link_data = {'id': link_id, 'active': bool(state)}
350 |
351 | if hasattr(pynvml, 'nvmlDeviceGetNvLinkCapability'):
352 | if hasattr(pynvml, 'NVML_NVLINK_CAP_P2P_SUPPORTED'):
353 | if caps := safe_get(pynvml.nvmlDeviceGetNvLinkCapability, handle,
354 | link_id, pynvml.NVML_NVLINK_CAP_P2P_SUPPORTED):
355 | link_data['p2p_supported'] = bool(caps)
356 |
357 | nvlinks.append(link_data)
358 | if state:
359 | active_count += 1
360 | else:
361 | break
362 |
363 | if nvlinks:
364 | data['nvlink_links'] = nvlinks
365 | data['nvlink_active_count'] = active_count
366 |
367 |
--------------------------------------------------------------------------------
/static/js/socket-handlers.js:
--------------------------------------------------------------------------------
1 | /**
2 | * WebSocket event handlers
3 | */
4 |
// WebSocket connection with auto-reconnect
let socket = null;            // active WebSocket instance (null when disconnected)
let reconnectInterval = null; // setInterval handle driving the reconnect cycle
let reconnectAttempts = 0;    // attempts made in the current reconnect cycle
const MAX_RECONNECT_ATTEMPTS = 10;
const RECONNECT_DELAY = 2000; // Start with 2 seconds
11 |
// Build a WebSocket whose scheme (ws/wss) matches the page's own protocol.
function createWebSocketConnection() {
    const secure = window.location.protocol === 'https:';
    const url = (secure ? 'wss:' : 'ws:') + '//' + window.location.host + '/socket.io/';
    return new WebSocket(url);
}
17 |
// Open the socket unless a connection attempt is already underway or live.
function connectWebSocket() {
    const busy = socket &&
        (socket.readyState === WebSocket.CONNECTING || socket.readyState === WebSocket.OPEN);
    if (busy) return;

    socket = createWebSocketConnection();
    setupWebSocketHandlers();
}
26 |
// Attach the lifecycle callbacks to the current socket, if any.
function setupWebSocketHandlers() {
    if (!socket) return;

    Object.assign(socket, {
        onopen: handleSocketOpen,
        onmessage: handleSocketMessage,
        onclose: handleSocketClose,
        onerror: handleSocketError,
    });
}
35 |
// Connection established: reset the reconnect state machine and show green.
function handleSocketOpen() {
    console.log('Connected to server');

    reconnectAttempts = 0;
    clearInterval(reconnectInterval);
    reconnectInterval = null;

    const statusEl = document.getElementById('connection-status');
    if (!statusEl) return;
    statusEl.textContent = 'Connected';
    statusEl.style.color = '#43e97b';
}
48 |
// Connection dropped: show the amber "reconnecting" state and start retrying.
function handleSocketClose() {
    console.log('Disconnected from server');

    const statusEl = document.getElementById('connection-status');
    if (statusEl) {
        statusEl.style.color = '#ffc107';
        statusEl.textContent = 'Reconnecting...';
    }

    attemptReconnect();
}
61 |
// Surface transport errors in the status indicator (red).
function handleSocketError(error) {
    console.error('WebSocket error:', error);

    const statusEl = document.getElementById('connection-status');
    if (!statusEl) return;
    statusEl.textContent = 'Connection Error';
    statusEl.style.color = '#f5576c';
}
70 |
// Retry the connection every RECONNECT_DELAY ms, up to MAX_RECONNECT_ATTEMPTS.
// After the limit, turn the status badge into a manual "tap to reload" control.
function attemptReconnect() {
    if (reconnectInterval) return; // a reconnect cycle is already running

    const giveUp = () => {
        clearInterval(reconnectInterval);
        reconnectInterval = null;
        const statusEl = document.getElementById('connection-status');
        if (statusEl) {
            statusEl.textContent = 'Disconnected - Tap to Reload';
            statusEl.style.color = '#f5576c';
            statusEl.style.cursor = 'pointer';
            statusEl.onclick = () => location.reload();
        }
    };

    reconnectInterval = setInterval(() => {
        if (reconnectAttempts >= MAX_RECONNECT_ATTEMPTS) {
            giveUp();
            return;
        }
        reconnectAttempts++;
        console.log(`Reconnection attempt ${reconnectAttempts}/${MAX_RECONNECT_ATTEMPTS}`);
        connectWebSocket();
    }, RECONNECT_DELAY);
}
93 |
// Initialize connection
connectWebSocket();

// Performance: Scroll detection to pause DOM updates during scroll
let isScrolling = false; // true while the user is actively scrolling
let scrollTimeout;       // debounce timer that clears isScrolling again
const SCROLL_PAUSE_DURATION = 100; // ms to wait after scroll stops before resuming updates
101 |
/**
 * Setup scroll event listeners to detect when user is scrolling
 * Uses passive listeners for better performance
 */
function setupScrollDetection() {
    const markScrolling = () => {
        isScrolling = true;
        clearTimeout(scrollTimeout);
        // Debounce: the flag stays set until SCROLL_PAUSE_DURATION ms of quiet.
        scrollTimeout = setTimeout(() => { isScrolling = false; }, SCROLL_PAUSE_DURATION);
    };

    // Defer attachment until the DOM is ready.
    setTimeout(() => {
        // Window is the primary scroll container...
        window.addEventListener('scroll', markScrolling, { passive: true });

        // ...with .container as a fallback scroll area.
        const container = document.querySelector('.container');
        if (container) {
            container.addEventListener('scroll', markScrolling, { passive: true });
        }
    }, 500);
}
127 |
// Initialize scroll detection
setupScrollDetection();

// Performance: Batched rendering system using requestAnimationFrame
// Batches all DOM updates into a single frame to minimize reflows/repaints
let pendingUpdates = new Map(); // Queue of pending GPU/system updates (gpuId or '_system' -> payload)
let rafScheduled = false; // Flag to prevent duplicate RAF scheduling

// Performance: Throttle text updates (less critical than charts)
const lastDOMUpdate = {}; // Track last update time per GPU (plus a 'system' key)
const DOM_UPDATE_INTERVAL = 1000; // Text/card updates every 1s, charts update every frame
139 |
// Handle incoming GPU data
// Entry point for every WebSocket message; payloads are JSON. Single-node
// payloads carry { gpus, processes, system }; hub payloads set mode: 'hub'.
function handleSocketMessage(event) {
    const data = JSON.parse(event.data);
    // Hub mode: different data structure with nodes
    if (data.mode === 'hub') {
        handleClusterData(data);
        return;
    }

    const overviewContainer = document.getElementById('overview-container');

    // Clear loading state
    if (overviewContainer.innerHTML.includes('Loading GPU data')) {
        overviewContainer.innerHTML = '';
    }

    const gpuCount = Object.keys(data.gpus).length;
    const now = Date.now();

    // Performance: Skip ALL DOM updates during active scrolling
    if (isScrolling) {
        // Still update chart data arrays (lightweight) to maintain continuity
        // This ensures no data gaps when scroll ends
        Object.keys(data.gpus).forEach(gpuId => {
            const gpuInfo = data.gpus[gpuId];
            if (!chartData[gpuId]) {
                initGPUData(gpuId, {
                    utilization: gpuInfo.utilization,
                    temperature: gpuInfo.temperature,
                    memory: (gpuInfo.memory_used / gpuInfo.memory_total) * 100,
                    power: gpuInfo.power_draw,
                    fanSpeed: gpuInfo.fan_speed,
                    clockGraphics: gpuInfo.clock_graphics,
                    clockSm: gpuInfo.clock_sm,
                    clockMemory: gpuInfo.clock_memory
                });
            }
            updateAllChartDataOnly(gpuId, gpuInfo);
        });
        return; // Exit early - zero DOM work during scroll = smooth 60 FPS
    }

    // Process each GPU - queue updates for batched rendering
    Object.keys(data.gpus).forEach(gpuId => {
        const gpuInfo = data.gpus[gpuId];

        // Initialize chart data structures if first time seeing this GPU
        if (!chartData[gpuId]) {
            initGPUData(gpuId, {
                utilization: gpuInfo.utilization,
                temperature: gpuInfo.temperature,
                memory: (gpuInfo.memory_used / gpuInfo.memory_total) * 100,
                power: gpuInfo.power_draw,
                fanSpeed: gpuInfo.fan_speed,
                clockGraphics: gpuInfo.clock_graphics,
                clockSm: gpuInfo.clock_sm,
                clockMemory: gpuInfo.clock_memory
            });
        }

        // Determine if text/card DOM should update (throttled) or just charts (every frame)
        const shouldUpdateDOM = !lastDOMUpdate[gpuId] || (now - lastDOMUpdate[gpuId]) >= DOM_UPDATE_INTERVAL;

        // Queue this GPU's update instead of executing immediately
        pendingUpdates.set(gpuId, {
            gpuInfo,
            shouldUpdateDOM,
            now
        });

        // Handle initial card creation (can't be batched since we need the DOM element)
        const existingOverview = overviewContainer.querySelector(`[data-gpu-id="${gpuId}"]`);
        if (!existingOverview) {
            overviewContainer.insertAdjacentHTML('beforeend', createOverviewCard(gpuId, gpuInfo));
            initOverviewMiniChart(gpuId, gpuInfo.utilization);
            lastDOMUpdate[gpuId] = now;
        }
    });

    // Queue system updates (processes/CPU/RAM) for batching
    if (!lastDOMUpdate.system || (now - lastDOMUpdate.system) >= DOM_UPDATE_INTERVAL) {
        pendingUpdates.set('_system', {
            processes: data.processes,
            system: data.system,
            now
        });
    }

    // Schedule single batched render (if not already scheduled)
    // This ensures all updates happen in ONE animation frame
    if (!rafScheduled && pendingUpdates.size > 0) {
        rafScheduled = true;
        requestAnimationFrame(processBatchedUpdates);
    }

    // Auto-switch to single GPU view if only 1 GPU detected (first time only)
    autoSwitchSingleGPU(gpuCount, Object.keys(data.gpus));
}
238 |
/**
 * Flush every queued update inside one animation frame.
 *
 * Running all DOM writes in a single layout/paint cycle avoids the layout
 * thrashing that interleaved per-message updates would cause.
 */
function processBatchedUpdates() {
    rafScheduled = false;

    for (const [gpuId, update] of pendingUpdates) {
        if (gpuId === '_system') {
            // Host-level widgets: process list, CPU and RAM.
            updateProcesses(update.processes);
            updateSystemInfo(update.system);
            lastDOMUpdate.system = update.now;
            continue;
        }

        const { gpuInfo, shouldUpdateDOM, now } = update;

        // Charts refresh every frame; text/card content is throttled.
        updateOverviewCard(gpuId, gpuInfo, shouldUpdateDOM);
        if (shouldUpdateDOM) {
            lastDOMUpdate[gpuId] = now;
        }

        // Only touch the per-GPU detail tab when it is on screen,
        // or when the tab has not been registered yet.
        const detailVisible = currentTab === `gpu-${gpuId}`;
        if (detailVisible || !registeredGPUs.has(gpuId)) {
            ensureGPUTab(gpuId, gpuInfo, shouldUpdateDOM && detailVisible);
        }
    }

    // Clear queue for next batch
    pendingUpdates.clear();
}
278 |
/**
 * Append fresh samples to a GPU's chart data arrays without rendering.
 *
 * Used while the user scrolls: data continuity is preserved, but the
 * expensive canvas/DOM work is skipped so scrolling stays at 60 FPS.
 *
 * @param {string} gpuId - GPU identifier
 * @param {object} gpuInfo - GPU metrics data
 */
function updateAllChartDataOnly(gpuId, gpuInfo) {
    const store = chartData[gpuId];
    if (!store) return;

    const timestamp = new Date().toLocaleTimeString();
    const memUsed = gpuInfo.memory_used || 0;
    const memTotal = gpuInfo.memory_total || 1;
    const watts = gpuInfo.power_draw || 0;
    const util = gpuInfo.utilization || 0;

    // Latest sample for every single-line chart.
    const samples = {
        utilization: util,
        temperature: gpuInfo.temperature || 0,
        memory: (memUsed / memTotal) * 100,
        power: watts,
        fanSpeed: gpuInfo.fan_speed || 0,
        efficiency: watts > 0 ? util / watts : 0
    };

    const MAX_POINTS = 120; // 60 s of history at the 0.5 s poll interval

    for (const [chartType, value] of Object.entries(samples)) {
        const series = store[chartType];
        if (!series?.labels || !series?.data) continue;

        series.labels.push(timestamp);
        series.data.push(Number(value) || 0);

        // Keep the reference/threshold lines in lockstep with the data.
        if (chartType === 'utilization' && series.thresholdData) {
            series.thresholdData.push(80);
        } else if (chartType === 'temperature') {
            if (series.warningData) series.warningData.push(75);
            if (series.dangerData) series.dangerData.push(85);
        } else if (chartType === 'memory' && series.thresholdData) {
            series.thresholdData.push(90);
        }

        // Trim to the rolling window.
        if (series.labels.length > MAX_POINTS) {
            series.labels.shift();
            series.data.shift();
            if (series.thresholdData) series.thresholdData.shift();
            if (series.warningData) series.warningData.shift();
            if (series.dangerData) series.dangerData.shift();
        }
    }

    // The clocks chart carries three lines and is handled separately.
    const clocks = store.clocks;
    if (clocks?.labels) {
        clocks.labels.push(timestamp);
        clocks.graphicsData.push(gpuInfo.clock_graphics || 0);
        clocks.smData.push(gpuInfo.clock_sm || 0);
        clocks.memoryData.push(gpuInfo.clock_memory || 0);

        if (clocks.labels.length > MAX_POINTS) {
            clocks.labels.shift();
            clocks.graphicsData.shift();
            clocks.smData.shift();
            clocks.memoryData.shift();
        }
    }
}
351 |
// Handle page visibility changes (phone lock/unlock, tab switch)
document.addEventListener('visibilitychange', () => {
    // Only react when the page returns to the foreground.
    if (document.visibilityState !== 'visible') return;

    console.log('Page visible - checking connection');
    const connected = socket && socket.readyState === WebSocket.OPEN;
    if (!connected) {
        // Socket died while hidden: reset the retry cycle and reconnect now.
        reconnectAttempts = 0;
        clearInterval(reconnectInterval);
        reconnectInterval = null;
        connectWebSocket();
    }
});
366 |
// Also handle page focus (additional safety)
window.addEventListener('focus', () => {
    const connected = socket && socket.readyState === WebSocket.OPEN;
    if (connected) return;

    console.log('Window focused - checking connection');
    // Reset the retry cycle and reconnect immediately.
    reconnectAttempts = 0;
    clearInterval(reconnectInterval);
    reconnectInterval = null;
    connectWebSocket();
});
377 |
/**
 * Handle cluster/hub mode data
 * Data structure: { mode: 'hub', nodes: {...}, cluster_stats: {...} }
 * GPUs are keyed as `${nodeName}-${gpuId}` so multiple nodes never collide.
 */
function handleClusterData(data) {
    const overviewContainer = document.getElementById('overview-container');
    const now = Date.now();

    // Clear loading state
    if (overviewContainer.innerHTML.includes('Loading GPU data')) {
        overviewContainer.innerHTML = '';
    }

    // Skip DOM updates during scrolling
    if (isScrolling) {
        // Still update chart data for continuity
        Object.entries(data.nodes).forEach(([nodeName, nodeData]) => {
            if (nodeData.status === 'online') {
                Object.entries(nodeData.gpus).forEach(([gpuId, gpuInfo]) => {
                    const fullGpuId = `${nodeName}-${gpuId}`;
                    if (!chartData[fullGpuId]) {
                        initGPUData(fullGpuId, {
                            utilization: gpuInfo.utilization,
                            temperature: gpuInfo.temperature,
                            memory: (gpuInfo.memory_used / gpuInfo.memory_total) * 100,
                            power: gpuInfo.power_draw,
                            fanSpeed: gpuInfo.fan_speed,
                            clockGraphics: gpuInfo.clock_graphics,
                            clockSm: gpuInfo.clock_sm,
                            clockMemory: gpuInfo.clock_memory
                        });
                    }
                    updateAllChartDataOnly(fullGpuId, gpuInfo);
                });
            }
        });
        return;
    }

    // Render GPUs grouped by node (minimal grouping)
    Object.entries(data.nodes).forEach(([nodeName, nodeData]) => {
        // Get or create node group container
        let nodeGroup = overviewContainer.querySelector(`[data-node="${nodeName}"]`);
        if (!nodeGroup) {
            // NOTE(review): the node-group markup in this template appears
            // garbled in this extraction (HTML tags stripped) — verify the
            // real markup against the repository before editing it.
            overviewContainer.insertAdjacentHTML('beforeend', `


${nodeName}


            `);
            nodeGroup = overviewContainer.querySelector(`[data-node="${nodeName}"]`);
        }

        const nodeGrid = nodeGroup.querySelector('.node-grid');

        if (nodeData.status === 'online') {
            // Node is online - process its GPUs normally
            Object.entries(nodeData.gpus).forEach(([gpuId, gpuInfo]) => {
                const fullGpuId = `${nodeName}-${gpuId}`;

                // Initialize chart data with current values
                if (!chartData[fullGpuId]) {
                    initGPUData(fullGpuId, {
                        utilization: gpuInfo.utilization,
                        temperature: gpuInfo.temperature,
                        memory: (gpuInfo.memory_used / gpuInfo.memory_total) * 100,
                        power: gpuInfo.power_draw,
                        fanSpeed: gpuInfo.fan_speed,
                        clockGraphics: gpuInfo.clock_graphics,
                        clockSm: gpuInfo.clock_sm,
                        clockMemory: gpuInfo.clock_memory
                    });
                }

                // Queue update (throttled text, per-frame charts — same as single-node mode)
                const shouldUpdateDOM = !lastDOMUpdate[fullGpuId] || (now - lastDOMUpdate[fullGpuId]) >= DOM_UPDATE_INTERVAL;
                pendingUpdates.set(fullGpuId, {
                    gpuInfo,
                    shouldUpdateDOM,
                    now,
                    nodeName
                });

                // Create card if doesn't exist
                const existingCard = nodeGrid.querySelector(`[data-gpu-id="${fullGpuId}"]`);
                if (!existingCard) {
                    nodeGrid.insertAdjacentHTML('beforeend', createClusterGPUCard(nodeName, gpuId, gpuInfo));
                    initOverviewMiniChart(fullGpuId, gpuInfo.utilization);
                    lastDOMUpdate[fullGpuId] = now;
                }
            });
        } else {
            // Node is offline - remove entire node group
            const existingCards = nodeGrid.querySelectorAll('[data-gpu-id]');
            existingCards.forEach(card => {
                const gpuId = card.getAttribute('data-gpu-id');
                // Clean up chart data
                if (chartData[gpuId]) {
                    delete chartData[gpuId];
                }
                if (lastDOMUpdate[gpuId]) {
                    delete lastDOMUpdate[gpuId];
                }
                // Remove the GPU tab
                removeGPUTab(gpuId);
            });

            // Remove the entire node group from the UI
            nodeGroup.remove();
        }
    });

    // Update processes and system info (use first online node)
    const firstOnlineNode = Object.values(data.nodes).find(n => n.status === 'online');
    if (firstOnlineNode) {
        if (!lastDOMUpdate.system || (now - lastDOMUpdate.system) >= DOM_UPDATE_INTERVAL) {
            pendingUpdates.set('_system', {
                processes: firstOnlineNode.processes || [],
                system: firstOnlineNode.system || {},
                now
            });
        }
    }

    // Schedule batched render
    if (!rafScheduled && pendingUpdates.size > 0) {
        rafScheduled = true;
        requestAnimationFrame(processBatchedUpdates);
    }
}
508 |
/**
 * Create GPU card for cluster view (includes node name)
 * Returns an HTML string; the caller inserts it into the node's grid.
 */
function createClusterGPUCard(nodeName, gpuId, gpuInfo) {
    const fullGpuId = `${nodeName}-${gpuId}`;
    const memory_used = getMetricValue(gpuInfo, 'memory_used', 0);
    const memory_total = getMetricValue(gpuInfo, 'memory_total', 1);
    const memPercent = (memory_used / memory_total) * 100;

    // NOTE(review): the HTML template below is garbled/elided in this
    // extraction (tags and several lines stripped) — verify the real markup
    // against the repository before editing; only the interpolated metric
    // expressions are reliably visible here.
    return `


${getMetricValue(gpuInfo, 'utilization', 0)}%

GPU Usage


${getMetricValue(gpuInfo, 'temperature', 0)}°C

Temperature


${Math.round(memPercent)}%

Memory


${getMetricValue(gpuInfo, 'power_draw', 0).toFixed(0)}W

Power Draw



    `;
}
560 |
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | GPU Hot - Real-time NVIDIA GPU Monitoring
7 |
8 |
9 |
10 |
11 |
604 |
605 |
606 |
607 |
608 |
614 |
615 |
616 |
617 | GPU Hot
Metrics in Seconds
618 | Real-time GPU monitoring in your browser. Start with one server, scale to dozens. No infrastructure, no setup, no SSH. Just one command.
619 |
633 |
634 |
635 |
636 |
654 |
655 |
656 |
657 |
669 |
670 |
717 |
718 |
719 |
--------------------------------------------------------------------------------
/static/js/chart-manager.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Chart management - data storage, updates, and initialization
3 | * Requires: chart-config.js to be loaded first
4 | */
5 |
// Detect if we're on a mobile device
function isMobile() {
    const MOBILE_MAX_WIDTH = 768; // px breakpoint for mobile layout
    return window.innerWidth <= MOBILE_MAX_WIDTH;
}
10 |
// Get mobile-optimized chart options
function getMobileChartOptions(baseOptions) {
    // Desktop keeps the base configuration untouched.
    if (!isMobile()) return baseOptions;

    // Deep clone so the shared base config is never mutated.
    const opts = JSON.parse(JSON.stringify(baseOptions));
    const isVerySmall = window.innerWidth <= 375;
    const { scales, plugins, layout } = opts;

    // Axes: drop the x-axis time labels entirely, keep a minimal y axis.
    if (scales) {
        if (scales.x) scales.x.display = false;
        if (scales.y) {
            scales.y.display = true;
            const ticks = scales.y.ticks || {};
            ticks.font = { size: isVerySmall ? 8 : 9 };
            ticks.padding = 3;
            ticks.color = 'rgba(255, 255, 255, 0.5)';
            ticks.maxTicksLimit = 3;
            scales.y.ticks = ticks;

            const grid = scales.y.grid || {};
            grid.color = 'rgba(255, 255, 255, 0.08)';
            grid.lineWidth = 1;
            grid.drawBorder = true;
            scales.y.grid = grid;
        }
    }

    // Tooltips stay enabled, but slimmed down.
    if (plugins && plugins.tooltip) {
        plugins.tooltip.enabled = true;
        plugins.tooltip.padding = 8;
        plugins.tooltip.titleFont = { size: 11 };
        plugins.tooltip.bodyFont = { size: 10 };
    }

    // No room for legends on small screens.
    if (plugins && plugins.legend) {
        plugins.legend.display = false;
    }

    // Preserve a little padding so the chart still renders cleanly.
    if (layout && layout.padding) {
        layout.padding = { left: 10, right: 15, top: 5, bottom: 10 };
    }

    // Ensure chart renders
    opts.responsive = true;
    opts.maintainAspectRatio = false;

    return opts;
}
64 |
// Store charts and data
const charts = {};    // Chart.js instances
const chartData = {}; // raw series arrays keyed by gpuId

// Initialize chart data for a GPU with pre-filled baseline data
function initGPUData(gpuId, initialValues = {}) {
    const POINTS = 120; // 60 seconds at 0.5s interval
    const STEP_MS = 500;

    // Back-fill timestamps so the chart opens with a full timeline.
    const labels = [];
    for (let i = POINTS - 1; i >= 0; i--) {
        labels.push(new Date(Date.now() - i * STEP_MS).toLocaleTimeString());
    }

    // Constant-filled series of the full window length.
    const filled = (value = 0) => new Array(POINTS).fill(value);
    // Series seeded from the caller-supplied initial metric value.
    const baseline = (key) => filled(initialValues[key] || 0);

    chartData[gpuId] = {
        utilization: { labels: [...labels], data: baseline('utilization'), thresholdData: filled(80) },
        temperature: { labels: [...labels], data: baseline('temperature'), warningData: filled(75), dangerData: filled(85) },
        memory: { labels: [...labels], data: baseline('memory'), thresholdData: filled(90) },
        power: { labels: [...labels], data: baseline('power') },
        fanSpeed: { labels: [...labels], data: baseline('fanSpeed') },
        clocks: {
            labels: [...labels],
            graphicsData: baseline('clockGraphics'),
            smData: baseline('clockSm'),
            memoryData: baseline('clockMemory')
        },
        efficiency: { labels: [...labels], data: baseline('efficiency') },
        pcie: { labels: [...labels], dataRX: baseline('pcieRX'), dataTX: baseline('pcieTX') },
        appclocks: {
            labels: [...labels],
            dataGr: baseline('appclockGr'),
            dataMem: baseline('appclockMem'),
            dataSM: baseline('appclockSM'),
            dataVideo: baseline('appclockVideo')
        }
    };
}
132 |
// Calculate statistics for chart data
// Single-pass implementation: replaces Math.min(...arr)/Math.max(...arr),
// which spread every element onto the call stack and can throw a
// RangeError on very large arrays; also walks the data once instead of
// filter + reduce + two spreads.
// Returns { min, max, avg, current } with zeros when no finite sample exists.
function calculateStats(data) {
    if (!data || !Array.isArray(data) || data.length === 0) {
        return { min: 0, max: 0, avg: 0, current: 0 };
    }

    let min = Infinity;
    let max = -Infinity;
    let sum = 0;
    let count = 0;
    let current = 0;

    for (const raw of data) {
        if (!isFinite(raw)) continue; // skip NaN/±Infinity (and non-numeric junk)
        const val = Number(raw);      // normalize numeric strings to numbers
        if (val < min) min = val;
        if (val > max) max = val;
        sum += val;
        count++;
        current = val; // last finite value wins, as before
    }

    if (count === 0) {
        return { min: 0, max: 0, avg: 0, current: 0 };
    }

    return { min, max, avg: sum / count, current };
}
157 |
// Update statistics display for a chart
function updateChartStats(gpuId, chartType, stats, unit) {
    // Efficiency is shown with two decimals; everything else is rounded.
    const format = (value) =>
        chartType === 'efficiency' ? value.toFixed(2) : Math.round(value);

    // Write each stat into its element, skipping any missing nodes.
    for (const field of ['current', 'min', 'max', 'avg']) {
        const el = document.getElementById(`stat-${chartType}-${field}-${gpuId}`);
        if (el) el.textContent = `${format(stats[field])}${unit}`;
    }
}
178 |
// Update statistics display for PCIe chart (RX and TX separately)
function updatePCIeChartStats(gpuId, statsRX, statsTX) {
    // Smart formatter that converts KB/s to MB/s when >= 1000
    const formatBandwidth = (value) => {
        if (value >= 1000) {
            return `${(value / 1024).toFixed(1)} MB/s`;
        }
        return `${Math.round(value)} KB/s`;
    };

    // Both directions share the same element-id pattern.
    for (const [dir, stats] of [['rx', statsRX], ['tx', statsTX]]) {
        for (const field of ['current', 'min', 'max', 'avg']) {
            const el = document.getElementById(`stat-pcie-${dir}-${field}-${gpuId}`);
            if (el) el.textContent = formatBandwidth(stats[field]);
        }
    }
}
211 |
// Update mobile chart header value display
function updateMobileChartValue(gpuId, chartType, value, unit) {
    const canvas = document.querySelector(`#chart-${chartType}-${gpuId}`);
    const header = canvas?.closest('.chart-container')?.querySelector('.chart-header');
    if (!header) return;

    // Efficiency keeps two decimals; other metrics are rounded.
    const shown = chartType === 'efficiency' ? value.toFixed(2) : Math.round(value);
    header.setAttribute('data-value', `${shown}${unit}`);
}
220 |
// Maximum points retained per series: 60 seconds of history at a 0.5s poll interval.
const MAX_CHART_POINTS = 120;

// Coerce a value to a non-negative finite number; anything invalid becomes 0.
function _safeChartNumber(val) {
    const num = Number(val);
    return (isFinite(num) && num >= 0) ? num : 0;
}

// Append the incoming sample(s) to the correct series for this chart type.
// Multi-series charts (clocks/pcie/appclocks) consume value2..value4.
function _pushChartValues(data, chartType, value, value2, value3, value4) {
    if (chartType === 'clocks') {
        data.graphicsData.push(_safeChartNumber(value));
        data.smData.push(_safeChartNumber(value2));
        data.memoryData.push(_safeChartNumber(value3));
    } else if (chartType === 'pcie') {
        data.dataRX.push(_safeChartNumber(value));
        data.dataTX.push(_safeChartNumber(value2));
    } else if (chartType === 'appclocks') {
        data.dataGr.push(_safeChartNumber(value));
        data.dataMem.push(_safeChartNumber(value2));
        data.dataSM.push(_safeChartNumber(value3));
        data.dataVideo.push(_safeChartNumber(value4));
    } else {
        data.data.push(_safeChartNumber(value));
    }
}

// Append the static threshold lines so they track the same x-axis length.
function _pushChartThresholds(data, chartType) {
    if (chartType === 'utilization') {
        data.thresholdData.push(80); // High load threshold at 80%
    } else if (chartType === 'temperature') {
        data.warningData.push(75); // Warning at 75°C
        data.dangerData.push(85); // Danger at 85°C
    } else if (chartType === 'memory') {
        data.thresholdData.push(90); // High usage at 90%
    }
}

// Drop the oldest point from every series once the rolling window is full.
function _trimChartHistory(data) {
    if (data.labels.length <= MAX_CHART_POINTS) return;
    data.labels.shift();
    const seriesKeys = [
        'data', 'graphicsData', 'smData', 'memoryData',
        'dataRX', 'dataTX', 'dataGr', 'dataMem', 'dataSM', 'dataVideo',
        'thresholdData', 'warningData', 'dangerData'
    ];
    for (const key of seriesKeys) {
        if (data[key]) data[key].shift();
    }
}

// Recompute min/max/avg/current for the chart and push them to the stats UI.
function _refreshChartStats(gpuId, chartType, data) {
    if (chartType === 'pcie') {
        // PCIe needs independent stats for both directions.
        updatePCIeChartStats(gpuId, calculateStats(data.dataRX), calculateStats(data.dataTX));
        return;
    }

    // Multi-series charts report stats for their primary series only.
    let statsData = data.data;
    if (chartType === 'clocks') statsData = data.graphicsData;
    else if (chartType === 'appclocks') statsData = data.dataGr;

    const stats = calculateStats(statsData);
    const unitMap = {
        'utilization': '%',
        'util': '%',
        'temperature': '°C',
        'temp': '°C',
        'memory': '%',
        'power': 'W',
        'fanSpeed': '%',
        'clocks': ' MHz',
        'efficiency': ' %/W',
        'appclocks': ' MHz'
    };
    const unit = unitMap[chartType] || '';
    updateChartStats(gpuId, chartType, stats, unit);

    // Mobile hides the stats row, so surface the current value in the header.
    if (isMobile()) {
        updateMobileChartValue(gpuId, chartType, stats.current, unit);
    }
}

/**
 * Append a new sample to a GPU chart's data series, maintain the rolling
 * window, refresh the statistics display, and redraw the chart.
 *
 * @param {string} gpuId     GPU identifier used to key chartData/charts.
 * @param {string} chartType Series key initialised by initGPUData
 *                           (e.g. 'utilization', 'temperature', 'pcie').
 * @param value   Primary sample (meaning depends on chartType).
 * @param value2  Second series value for multi-series charts (optional).
 * @param value3  Third series value (clocks/appclocks, optional).
 * @param value4  Fourth series value (appclocks only, optional).
 */
function updateChart(gpuId, chartType, value, value2, value3, value4) {
    // Validate inputs
    if (!gpuId || !chartType) {
        console.warn('updateChart: Missing gpuId or chartType');
        return;
    }

    if (!chartData[gpuId]) initGPUData(gpuId);

    const data = chartData[gpuId][chartType];
    if (!data) {
        console.warn(`updateChart: Invalid chartType "${chartType}" for GPU ${gpuId}`);
        return;
    }

    data.labels.push(new Date().toLocaleTimeString());
    _pushChartValues(data, chartType, value, value2, value3, value4);
    _pushChartThresholds(data, chartType);
    _trimChartHistory(data);
    _refreshChartStats(gpuId, chartType, data);

    // Redraw without animation; tolerate Chart.js errors (e.g. during teardown).
    if (charts[gpuId] && charts[gpuId][chartType]) {
        try {
            charts[gpuId][chartType].update('none');
        } catch (error) {
            console.error(`Error updating chart ${chartType} for GPU ${gpuId}:`, error);
        }
    }
}
334 |
// Initialize the utilization background chart (sparkline rendered behind the GPU card).
// No-op if the canvas is absent or the chart was already created.
function initUtilBackgroundChart(gpuId) {
    const canvas = document.getElementById(`util-bg-chart-${gpuId}`);
    if (!canvas) return;

    if (!charts[gpuId]) charts[gpuId] = {};
    if (charts[gpuId].utilBackground) return; // Already initialized

    // Fix: ensure the backing data store exists before linking it into the
    // chart — previously this threw if called before any sample arrived.
    if (!chartData[gpuId]) initGPUData(gpuId);

    charts[gpuId].utilBackground = new Chart(canvas, {
        type: 'line',
        data: {
            // Datasets reference the live arrays so chart.update() picks up new points.
            labels: chartData[gpuId].utilization.labels,
            datasets: [{
                data: chartData[gpuId].utilization.data,
                borderColor: 'rgba(79, 172, 254, 0.8)',
                backgroundColor: 'rgba(79, 172, 254, 0.3)',
                borderWidth: 2,
                tension: 0.4,
                fill: true,
                pointRadius: 0
            }]
        },
        options: {
            responsive: true,
            maintainAspectRatio: false,
            animation: false,
            // Pure background decoration: no axes, legend, or tooltips.
            scales: {
                x: { display: false },
                y: { display: false, min: 0, max: 100 }
            },
            plugins: {
                legend: { display: false },
                tooltip: { enabled: false }
            }
        }
    });
}
372 |
// Initialize (or re-initialize) all detail charts for one GPU.
// Destroys any existing Chart.js instances first to avoid leaks, then creates
// each chart from a deep-cloned config with its datasets linked to the live
// arrays in chartData so updateChart() mutations are reflected on redraw.
function initGPUCharts(gpuId) {
    if (!gpuId) {
        console.warn('initGPUCharts: Missing gpuId');
        return;
    }

    // Fix: ensure the backing data store exists before linking datasets —
    // previously chartData[gpuId][type] threw if no sample had arrived yet.
    if (!chartData[gpuId]) initGPUData(gpuId);

    const chartTypes = ['utilization', 'temperature', 'memory', 'power', 'fanSpeed', 'clocks', 'efficiency', 'pcie', 'appclocks'];
    if (!charts[gpuId]) charts[gpuId] = {};

    // Initialize background utilization chart
    initUtilBackgroundChart(gpuId);

    // Ordered chartData series keys for each multi-dataset chart type;
    // anything not listed is a simple single-series chart backed by `data`.
    const seriesKeys = {
        utilization: ['data', 'thresholdData'],
        temperature: ['data', 'warningData', 'dangerData'],
        memory: ['data', 'thresholdData'],
        clocks: ['graphicsData', 'smData', 'memoryData'],
        pcie: ['dataRX', 'dataTX'],
        appclocks: ['dataGr', 'dataMem', 'dataSM', 'dataVideo']
    };

    chartTypes.forEach(type => {
        const canvas = document.getElementById(`chart-${type}-${gpuId}`);
        if (!canvas) return;

        // Destroy existing chart to prevent memory leaks
        if (charts[gpuId][type]) {
            try {
                charts[gpuId][type].destroy();
            } catch (error) {
                console.warn(`Error destroying existing chart ${type} for GPU ${gpuId}:`, error);
            }
        }

        const config = JSON.parse(JSON.stringify(chartConfigs[type])); // Deep clone

        // Link datasets to the live chartData arrays FIRST (before styling).
        const keys = seriesKeys[type] || ['data'];
        keys.forEach((key, i) => {
            if (config.data.datasets[i]) {
                config.data.datasets[i].data = chartData[gpuId][type][key];
            }
        });
        config.data.labels = chartData[gpuId][type].labels;

        // Optimize dataset appearance for mobile (BEFORE applying options)
        if (isMobile() && config.data.datasets) {
            // Make first dataset prominent
            config.data.datasets[0].borderWidth = 3;
            config.data.datasets[0].pointRadius = 0;
            config.data.datasets[0].fill = true;

            // Hide other datasets by making them invisible (don't remove them!)
            for (let i = 1; i < config.data.datasets.length; i++) {
                config.data.datasets[i].hidden = true;
                config.data.datasets[i].borderWidth = 0;
            }
        }

        // Apply mobile optimizations to chart options
        config.options = getMobileChartOptions(config.options);

        // Ensure canvas has proper dimensions before creating chart
        const parent = canvas.parentElement;
        if (parent && parent.clientWidth > 0 && parent.clientHeight > 0) {
            // Set canvas dimensions to match container
            canvas.style.width = '100%';
            canvas.style.height = '100%';
        }

        // Create chart with error handling
        try {
            charts[gpuId][type] = new Chart(canvas, config);
        } catch (error) {
            console.error(`Error creating chart ${type} for GPU ${gpuId}:`, error);
        }
    });
}
465 |
// Initialize the small utilization sparkline shown on a GPU's overview card.
// Safe to call repeatedly: any prior chart instance is destroyed first.
function initOverviewMiniChart(gpuId, currentValue) {
    if (!gpuId) {
        console.warn('initOverviewMiniChart: Missing gpuId');
        return;
    }

    const canvas = document.getElementById(`overview-chart-${gpuId}`);
    if (!canvas) return;

    // Tear down any previous instance so Chart.js objects don't leak.
    const existing = charts[gpuId]?.overviewMini;
    if (existing) {
        try {
            existing.destroy();
        } catch (error) {
            console.warn(`Error destroying existing overview chart for GPU ${gpuId}:`, error);
        }
    }

    // Seed the data store with the current utilization if this GPU is new.
    if (!chartData[gpuId]) {
        initGPUData(gpuId, { utilization: currentValue });
    }

    // Smaller fonts, and a hidden y-axis on very narrow mobile screens.
    const mobile = isMobile();
    const tickFontSize = mobile ? 8 : 10;
    const showYAxis = !mobile || window.innerWidth > 480;

    const dataset = {
        data: chartData[gpuId].utilization.data,
        borderColor: '#4facfe',
        backgroundColor: 'rgba(79, 172, 254, 0.15)',
        borderWidth: mobile ? 2 : 2.5,
        tension: 0.4,
        fill: true,
        pointRadius: 0,
        pointHoverRadius: 3
    };

    const options = {
        responsive: true,
        maintainAspectRatio: false,
        animation: false, // Disable animations for overview charts
        interaction: { mode: 'index', intersect: false },
        scales: {
            x: { display: false },
            y: {
                min: 0,
                max: 100,
                display: showYAxis,
                grid: {
                    color: 'rgba(255, 255, 255, 0.08)',
                    drawBorder: false
                },
                ticks: {
                    color: 'rgba(255, 255, 255, 0.4)',
                    font: { size: tickFontSize },
                    stepSize: 50,
                    callback: value => value + '%'
                }
            }
        },
        plugins: {
            legend: { display: false },
            tooltip: {
                enabled: true,
                backgroundColor: 'rgba(0, 0, 0, 0.9)',
                padding: mobile ? 8 : 12,
                cornerRadius: 8,
                titleFont: { size: mobile ? 11 : 12 },
                bodyFont: { size: mobile ? 10 : 11 },
                callbacks: {
                    label: context => `GPU: ${context.parsed.y.toFixed(1)}%`
                }
            }
        }
    };

    if (!charts[gpuId]) charts[gpuId] = {};

    try {
        charts[gpuId].overviewMini = new Chart(canvas, {
            type: 'line',
            data: {
                labels: chartData[gpuId].utilization.labels,
                datasets: [dataset]
            },
            options
        });
    } catch (error) {
        console.error(`Error creating overview mini chart for GPU ${gpuId}:`, error);
    }
}
557 |
// System charts
// Chart.js instances for the host-level CPU/memory sparklines, keyed by metric name.
const systemCharts = {};
// Rolling time-series buffers backing those charts; labels hold locale time strings
// and data holds percentage readings, trimmed by updateSystemInfo().
const systemData = {
    cpu: { labels: [], data: [] },
    memory: { labels: [], data: [] }
};
564 |
// Build one compact sparkline chart wired to a live systemData series.
// The datasets reference the series arrays directly so chart.update('none')
// picks up points pushed later by updateSystemInfo().
function _createSystemSparkline(canvas, series) {
    return new Chart(canvas, {
        type: 'line',
        data: {
            labels: series.labels,
            datasets: [{
                data: series.data,
                borderColor: 'rgba(79, 172, 254, 0.8)',
                backgroundColor: 'rgba(79, 172, 254, 0.2)',
                borderWidth: 2,
                tension: 0.4,
                fill: true,
                pointRadius: 0
            }]
        },
        options: {
            responsive: true,
            maintainAspectRatio: false,
            animation: false,
            // Decorative sparkline: no axes, legend, or tooltips.
            scales: {
                x: { display: false },
                y: { display: false, min: 0, max: 100 }
            },
            plugins: {
                legend: { display: false },
                tooltip: { enabled: false }
            }
        }
    });
}

// Initialize the CPU and memory system sparklines, once per canvas.
// Idempotent: skips any chart that is already created or whose canvas is missing.
function initSystemCharts() {
    const cpuCanvas = document.getElementById('cpu-chart');
    const memCanvas = document.getElementById('memory-chart');

    if (cpuCanvas && !systemCharts.cpu) {
        systemCharts.cpu = _createSystemSparkline(cpuCanvas, systemData.cpu);
    }

    if (memCanvas && !systemCharts.memory) {
        systemCharts.memory = _createSystemSparkline(memCanvas, systemData.memory);
    }
}
632 |
// Update the system info panel (CPU / memory percentages) and its sparklines.
// Expects systemInfo to carry numeric cpu_percent and memory_percent fields.
function updateSystemInfo(systemInfo) {
    const cpuEl = document.getElementById('cpu-usage');
    if (cpuEl) cpuEl.textContent = `${Math.round(systemInfo.cpu_percent)}%`;

    const memEl = document.getElementById('memory-usage');
    if (memEl) memEl.textContent = `${Math.round(systemInfo.memory_percent)}%`;

    // Push this sample into both rolling series under a shared timestamp.
    const timestamp = new Date().toLocaleTimeString();
    const samples = [
        [systemData.cpu, systemInfo.cpu_percent],
        [systemData.memory, systemInfo.memory_percent]
    ];
    for (const [series, reading] of samples) {
        series.labels.push(timestamp);
        series.data.push(reading);
    }

    // Rolling window: 120 points = 60 seconds at a 0.5s interval.
    if (systemData.cpu.labels.length > 120) {
        for (const [series] of samples) {
            series.labels.shift();
            series.data.shift();
        }
    }

    // Lazily create the charts on first use.
    if (!systemCharts.cpu || !systemCharts.memory) {
        initSystemCharts();
    }

    // Redraw without animation.
    if (systemCharts.cpu) systemCharts.cpu.update('none');
    if (systemCharts.memory) systemCharts.memory.update('none');
}
666 |
667 |
--------------------------------------------------------------------------------