├── imgs ├── dash-1.png └── dash-2.png ├── MANIFEST.in ├── requirements.txt ├── gpuview ├── __main__.py ├── __init__.py ├── test_gpuview.py ├── demo.py ├── utils.py ├── app.py ├── core.py ├── service.py └── views │ └── index.tpl ├── .gitignore ├── todo.md ├── LICENSE ├── .circleci └── config.yml ├── setup.py └── README.md /imgs/dash-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgaim/gpuview/HEAD/imgs/dash-1.png -------------------------------------------------------------------------------- /imgs/dash-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgaim/gpuview/HEAD/imgs/dash-2.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include gpuview/views/index.tpl 3 | include gpuview/service.sh 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gpustat>=1.1.1 2 | bottle>=0.12.14 3 | requests>=2.25.0 4 | # testing 5 | pytest>=7.0.0 6 | pytest-cov>=4.0.0 7 | flake8>=6.0.0 8 | -------------------------------------------------------------------------------- /gpuview/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Entry point for the gpuview application. 
3 | """ 4 | 5 | if __name__ == "__main__": 6 | from .app import main 7 | 8 | main() 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .vscode/ 3 | .pytest_cache/ 4 | .eggs/ 5 | dist/ 6 | build/ 7 | tmp/ 8 | *.egg-info/ 9 | gpuhosts.db 10 | gpuview.log 11 | pypi.sh 12 | .DS_Store 13 | -------------------------------------------------------------------------------- /gpuview/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The gpuview module. 3 | """ 4 | 5 | version_info = (1, 1, 1) 6 | __version__ = ".".join(str(c) for c in version_info) 7 | 8 | 9 | __all__ = ( 10 | "version_info", 11 | "__version__", 12 | ) 13 | -------------------------------------------------------------------------------- /todo.md: -------------------------------------------------------------------------------- 1 | # todo for `gpuview` 2 | 3 | - [x] Make the dashboard live with partial asynchronous updates. 4 | - [x] Install as a service daemon and enable autostart at boot time. 5 | - [x] Migrate gpuview service to `systemd` and logging via `journald`. 6 | - [ ] Access protection of dashboard and APIs 7 | - [ ] Add animated graphs to dashboard using D3.js. 8 | 9 | Contributions are very welcome! 
def test_my_gpustat():
    """my_gpustat() returns a dict (either the stats or an ``error`` payload)."""
    from .core import my_gpustat

    stat = my_gpustat()
    assert stat is not None
    assert isinstance(stat, dict)


def test_all_gpustats():
    """all_gpustats() returns a list, possibly empty when no GPU/host responds."""
    from .core import all_gpustats

    stats = all_gpustats()
    assert stats is not None
    assert isinstance(stats, list)


def test_hosts_db():
    """Round-trip a dummy host through add_host / load_hosts / remove_host."""
    from .core import add_host, load_hosts, remove_host

    dummy_host = "dummy.host"
    add_host(dummy_host)
    try:
        assert dummy_host in load_hosts()
    finally:
        # Always unregister, so a failed assertion cannot leave the dummy
        # entry behind in the hosts database.
        remove_host(dummy_host)

    assert dummy_host not in load_hosts()


def test_arg_parser():
    """The command-line parser can be constructed."""
    from .utils import arg_parser

    parser = arg_parser()
    assert parser is not None
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | OTHER DEALINGS IN THE SOFTWARE. 25 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | build: 8 | docker: 9 | # specify the version you desire here 10 | # use `-browsers` prefix for selenium tests, e.g. `3.9-browsers` 11 | - image: cimg/python:3.9 12 | 13 | # Specify service dependencies here if necessary 14 | # CircleCI maintains a library of pre-built images 15 | # documented at https://circleci.com/docs/2.0/circleci-images/ 16 | # - image: circleci/postgres:9.4 17 | 18 | working_directory: ~/repo 19 | 20 | steps: 21 | - run: 22 | name: Clone repo via HTTPS 23 | command: | 24 | git clone https://github.com/fgaim/gpuview.git ~/repo 25 | 26 | - run: 27 | name: install dependencies 28 | command: | 29 | python3 -m venv venv 30 | . venv/bin/activate 31 | pip install -r requirements.txt 32 | 33 | - save_cache: 34 | paths: 35 | - ./venv 36 | key: v1-dependencies-{{ checksum "requirements.txt" }} 37 | 38 | - run: 39 | name: run tests 40 | command: | 41 | . 
def read_readme():
    """Return the text of ``README.md`` (read as UTF-8) for the long description."""
    with open("README.md", encoding="utf-8") as readme:
        text = readme.read()
    return text


# Trove classifiers published with the package.
_CLASSIFIERS = [
    "Development Status :: 4 - Beta",
    "License :: OSI Approved :: MIT License",
    "Operating System :: POSIX :: Linux",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: System :: Monitoring",
    "Topic :: System :: Hardware",
]

# Runtime dependencies and optional extras.
_INSTALL_REQUIRES = ["gpustat>=1.1.1", "bottle>=0.12.25", "requests>=2.25.0"]
_EXTRAS_REQUIRE = {
    "dev": ["pytest", "pytest-cov", "flake8"],
    "test": ["pytest", "pytest-cov"],
}

# Exposes the `gpuview` console command.
_ENTRY_POINTS = {
    "console_scripts": [
        "gpuview=gpuview.app:main",
    ],
}

setuptools.setup(
    name="gpuview",
    version=__version__,
    license="MIT",
    description="A lightweight web dashboard for monitoring GPU usage",
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/fgaim/gpuview",
    author="Fitsum Gaim",
    author_email="fitsum@geezlab.com",
    keywords="gpu web-monitoring",
    classifiers=_CLASSIFIERS,
    packages=["gpuview"],
    python_requires=">=3.9",
    install_requires=_INSTALL_REQUIRES,
    extras_require=_EXTRAS_REQUIRE,
    entry_points=_ENTRY_POINTS,
    include_package_data=True,
    zip_safe=False,
)
def generate_fake_gpustat(hostname: str, gpu_name: str, num_gpus: int) -> Dict[str, Any]:
    """Generate a fake gpustat dictionary for a single host.

    Args:
        hostname: Name reported for the fake host.
        gpu_name: Model name assigned to every generated GPU.
        num_gpus: Number of GPU entries to generate; zero or a negative
            value yields an empty ``gpus`` list.

    Returns:
        A dict with ``hostname``, ``gpus`` (one dict per GPU) and
        ``query_time`` (ISO-8601 string of the local generation time).
    """
    mem_total = 24576
    gpus = []
    for i in range(num_gpus):
        temp = random.randint(10, 90)
        if temp > 75:
            flag = "bg-danger"
        elif temp > 50:
            flag = "bg-warning"
        else:
            flag = "bg-success"

        mem_used = random.randint(1000, 24000)

        # One process per fake user (user0, user1, ...). The "users" count
        # below is derived from this list so the two fields never disagree.
        process_entries = []
        for j in range(random.randint(1, 2)):
            process_type = random.choice(["python", "torch", "cuda", "nvprof"])
            process_memory = random.randint(1000, 24000)
            process_entries.append(f"user{j}({process_type}, {process_memory}M) ")

        gpus.append(
            {
                "index": f"{i}",
                "uuid": str(uuid.uuid4()),
                "name": gpu_name,
                "temperature.gpu": temp,
                "utilization.gpu": random.randint(0, 100),
                "power.draw": random.randint(50, 350),
                "enforced.power.limit": 350,
                "memory.used": mem_used,
                "memory.total": mem_total,
                "memory": round(mem_used / mem_total * 100),
                "flag": flag,
                "users": len(process_entries),
                "user_processes": "".join(process_entries),
            }
        )

    return {
        "hostname": hostname,
        "gpus": gpus,
        "query_time": datetime.now().isoformat(),
    }
class _HelpAction(argparse._HelpAction):
    """``-h/--help`` action that also prints the help of every sub-command."""

    def __call__(self, parser, namespace, values, option_string=None):
        parser.print_help()
        # Sub-commands are registered on the parser as _SubParsersAction instances.
        subparsers_actions = [action for action in parser._actions if isinstance(action, argparse._SubParsersAction)]
        for subparsers_action in subparsers_actions:
            for choice, subparser in subparsers_action.choices.items():
                print("Subparser '{}'".format(choice))
                print(subparser.format_help())
        parser.exit()


def arg_parser():
    """Build the command-line parser for ``gpuview``.

    Sub-commands (stored in ``args.action``): ``run``, ``add``, ``remove``,
    ``hosts`` and ``service``. ``service`` has its own sub-commands (stored in
    ``args.service_command``): ``start``, ``status``, ``stop``, ``delete`` and
    ``logs``. ``run`` and ``service start`` share the server options
    ``--host``, ``--port``, ``--safe-zone`` and ``--exclude-self``.

    Returns:
        argparse.ArgumentParser: The fully configured parser.
    """
    parser = argparse.ArgumentParser(add_help=False)
    subparsers = parser.add_subparsers(dest="action", help="Action")

    # Options shared by `run` and `service start`.
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument("--host", default="0.0.0.0", help="IP address of host (default: 0.0.0.0)")
    # type=int: a user-supplied port is validated and has the same type as the default.
    base_parser.add_argument("--port", type=int, default=9988, help="Port number of host (default: 9988)")
    base_parser.add_argument("--safe-zone", action="store_true", help="Report all details including usernames")
    base_parser.add_argument("--exclude-self", action="store_true", help="Don't report to others but self-dashboard")

    run_parser = subparsers.add_parser("run", parents=[base_parser], help="Run gpuview server")
    run_parser.add_argument("-d", "--debug", action="store_true", help="Run server in debug mode")
    run_parser.add_argument("--demo", action="store_true", help="Run with fake data for testing purposes")

    add_parser = subparsers.add_parser("add", help="Register a new GPU host")
    add_parser.add_argument("--url", required=True, help="URL of GPU host (IP:Port, eg. X.X.X.X:9988)")
    add_parser.add_argument("--name", default=None, help="An optional readable name for the GPU host")

    rem_parser = subparsers.add_parser("remove", help="Remove a GPU host")
    rem_parser.add_argument("--url", required=True, help="Url of the GPU node to remove")

    subparsers.add_parser("hosts", help="Print all GPU hosts")

    service_parser = subparsers.add_parser("service", help="Manage the gpuview systemd service. Defaults to 'start'.")
    service_subparsers = service_parser.add_subparsers(
        dest="service_command", help="Service action [start, status, stop, delete, logs]"
    )

    # `start` inherits --host, --port, etc. from base_parser.
    start_parser = service_subparsers.add_parser(
        "start", parents=[base_parser], help="Install (if needed) and start the service."
    )
    start_parser.description = (
        "Installs if not already installed and starts the gpuview service. "
        "On first run, you can pass --host, --port, etc. "
        "to configure the service."
    )
    service_subparsers.add_parser("status", help="Check service status")
    service_subparsers.add_parser("stop", help="Stop the service")
    service_subparsers.add_parser("delete", help="Stop and delete the service")
    service_subparsers.add_parser("logs", help="View service logs using journalctl")

    parser.add_argument(
        "-v",
        "--version",
        action="version",
        help="Print gpuview and gpustat versions",
        version="gpuview %s || gpustat %s" % (__version__, __gpustat__),
    )
    parser.add_argument("-h", "--help", action=_HelpAction, help="Print this help message")
    return parser
def _date_handler(obj: Any) -> str:
    """``json.dumps`` fallback that serializes objects exposing ``isoformat()``.

    Raises:
        TypeError: If *obj* has no ``isoformat`` attribute.
    """
    if hasattr(obj, "isoformat"):
        return obj.isoformat()
    raise TypeError(type(obj))


@app.route("/")
def index() -> str:
    """Render the dashboard page with the stats of all hosts (or demo data)."""
    if DEMO_MODE:
        gpustats = demo.get_demo_gpustats()
    else:
        gpustats = core.all_gpustats()
    now = datetime.now().strftime("Updated at %Y-%m-%d %H-%M-%S")
    return template("index", gpustats=gpustats, update_time=now)


# Bottle's route keyword is `method`; an unknown `methods=` keyword is only
# stored as route config and has no effect on request matching.
@app.route("/gpustat", method="GET")  # deprecated alias
@app.route("/api/gpustat/self", method="GET")
def report_gpustat() -> str:
    """
    Returns the gpustat of this host as JSON.
    See `exclude-self` option of `gpuview run`.
    Available at both /gpustat (legacy) and /api/gpustat/self (RESTful).

    Demo mode takes precedence over `exclude-self`; when self is excluded
    an ``{"error": ...}`` payload is returned instead of the stats.
    """
    response.content_type = "application/json"
    if DEMO_MODE:
        resp = demo.get_demo_local_gpustat()
    elif EXCLUDE_SELF:
        resp = {"error": "Excluded self!"}
    else:
        resp = core.my_gpustat()
    return json.dumps(resp, default=_date_handler)


@app.route("/api/gpustat/all", method="GET")
def api_gpustat_all() -> str:
    """
    Returns aggregated gpustats for all hosts (same data as index page) as JSON.
    Used by frontend for live updates.
    """
    response.content_type = "application/json"
    if DEMO_MODE:
        resp = demo.get_demo_gpustats()
    else:
        resp = core.all_gpustats()
    return json.dumps(resp, default=_date_handler)


def main() -> None:
    """Parse the command line and dispatch to the selected gpuview action.

    With no (or an unknown) action the top-level help is printed.
    """
    global EXCLUDE_SELF, DEMO_MODE

    parser = utils.arg_parser()
    args = parser.parse_args()
    action = args.action

    if action == "run":
        core.safe_zone(args.safe_zone)
        EXCLUDE_SELF = args.exclude_self
        DEMO_MODE = args.demo
        app.run(host=args.host, port=args.port, debug=args.debug)
    elif action == "service":
        service_handlers = {
            "start": service_manager.start,
            "status": service_manager.status,
            "stop": service_manager.stop,
            "logs": service_manager.logs,
            "delete": service_manager.delete,
        }
        # `gpuview service` without a sub-command defaults to `start`.
        handler = service_handlers.get(args.service_command or "start")
        if handler is not None:
            handler(args)
    elif action == "add":
        core.add_host(args.url, args.name)
    elif action == "remove":
        core.remove_host(args.url)
    elif action == "hosts":
        core.print_hosts()
    else:
        parser.print_help()


if __name__ == "__main__":
    main()
def safe_zone(safe: bool = False) -> None:
    """Set the module-wide SAFE_ZONE flag (True: report all details)."""
    global SAFE_ZONE
    SAFE_ZONE = safe


def my_gpustat() -> Dict[str, Any]:
    """
    Returns a [safe] version of gpustat for this host.
        - See `--safe-zone` option of `gpuview run`.
        - Unless in safe zone, omit sensitive details, eg. uuid, username, and processes.
        - Set color flag based on gpu temperature:
            bg-danger (> 75), bg-warning (> 50), bg-success (> 25), bg-primary (otherwise)
        - GPUs whose `processes` entry is a string are left out of the result.

    Returns:
        dict: gpustat, or {"error": "<message>!"} if anything fails
    """

    try:
        stat = GPUStatCollection.new_query().jsonify()
        kept_gpus = []
        for gpu in stat["gpus"]:
            # NOTE(review): presumably gpustat reports a string here when process
            # details are unavailable -- confirm against gpustat.
            if isinstance(gpu["processes"], str):
                continue

            gpu["memory"] = round(float(gpu["memory.used"]) / float(gpu["memory.total"]) * 100)
            gpu["users"] = len({p["username"] for p in gpu["processes"]})
            if SAFE_ZONE:
                user_process = [f"{p['username']}({p['command']},{p['gpu_memory_usage']}M)" for p in gpu["processes"]]
                gpu["user_processes"] = " ".join(user_process)
            else:
                processes = len(gpu["processes"])
                gpu["user_processes"] = f"{gpu['users']}/{processes}"
                gpu.pop("processes", None)
                gpu.pop("uuid", None)
                gpu.pop("query_time", None)

            gpu["flag"] = "bg-primary"
            if gpu["temperature.gpu"] > 75:
                gpu["flag"] = "bg-danger"
            elif gpu["temperature.gpu"] > 50:
                gpu["flag"] = "bg-warning"
            elif gpu["temperature.gpu"] > 25:
                gpu["flag"] = "bg-success"

            kept_gpus.append(gpu)

        # Build a filtered list rather than popping by index: popping in
        # ascending order shifts the later indices and drops the wrong GPUs.
        stat["gpus"] = kept_gpus
        return stat
    except Exception as e:
        return {"error": f"{e}!"}


def all_gpustats() -> List[Dict[str, Any]]:
    """
    Aggregates the gpustats of all registered hosts and this host.

    Hosts that cannot be reached, or that answer without a `gpus` entry,
    are skipped (the error is printed). The result is sorted by hostname.

    Returns:
        list: gpustats of hosts
    """

    gpustats = []
    mystat = my_gpustat()
    if "gpus" in mystat:
        gpustats.append(mystat)

    hosts = load_hosts()
    for url, name in hosts.items():
        # Hosts are registered as bare "IP:Port"; requests needs an explicit scheme.
        base_url = url if "://" in url else f"http://{url}"
        try:
            response = requests.get(base_url + "/gpustat", timeout=10)
            response.raise_for_status()
            gpustat = response.json()
            if not isinstance(gpustat, dict) or "gpus" not in gpustat:
                continue
            if name != url:
                # A readable name was registered for this host; prefer it.
                gpustat["hostname"] = name
            gpustats.append(gpustat)
        except Exception as e:
            print(f"Error: {e} getting gpustat from {url}")

    # Entries without a hostname sort first instead of aborting the sort.
    return sorted(gpustats, key=lambda g: str(g.get("hostname", "")))
def _normalize_host_url(url: str) -> str:
    """Strip surrounding whitespace and slashes so equal hosts compare equal."""
    return url.strip().strip("/")


def save_hosts(hosts: Dict[str, str]) -> None:
    """Overwrite the hosts database with one `name<TAB>url` line per host.

    Args:
        hosts: Mapping of {url: name}.
    """
    with open(HOSTS_DB, "w") as f:
        f.writelines(f"{name}\t{url}\n" for url, name in hosts.items())


def add_host(url: str, name: Optional[str] = None) -> None:
    """Register a host, replacing any existing entry with the same URL.

    Args:
        url: Address of the host; surrounding whitespace and slashes are dropped.
        name: Optional readable name; defaults to the normalized URL.
    """
    url = _normalize_host_url(url)
    if name is None:
        name = url
    hosts = load_hosts()
    hosts[url] = name
    save_hosts(hosts)
    print("Successfully added host!")


def remove_host(url: str) -> None:
    """Unregister a host and report whether it was found.

    Args:
        url: Address of the host; normalized the same way as in `add_host`,
            so e.g. a trailing slash does not prevent the match.
    """
    url = _normalize_host_url(url)
    hosts = load_hosts()
    # Membership test rather than pop() truthiness: an entry with an empty
    # name must still count as found.
    if url in hosts:
        del hosts[url]
        save_hosts(hosts)
        print(f"Removed host: {url}!")
    else:
        print(f"Couldn't find host: {url}!")
def _run_sudo(command: list[str]):
    """Run a privileged command, exiting the program with status 1 on failure.

    The command is prefixed with ``sudo`` unless the process already runs as
    root, so management also works on root-only systems without ``sudo``.

    Args:
        command: The command and its arguments, without ``sudo``.
    """
    is_root = hasattr(os, "geteuid") and os.geteuid() == 0
    full_command = list(command) if is_root else ["sudo"] + list(command)
    printable = " ".join(full_command)
    try:
        print(f"Running: {printable}")
        subprocess.run(full_command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error: Command failed: '{printable}'", file=sys.stderr)
        print(f"Return code: {e.returncode}", file=sys.stderr)
        print("Please check permissions and systemd logs.", file=sys.stderr)
        sys.exit(1)
    except FileNotFoundError:
        # The executable that could not be found is the first word of the command line.
        print(f"Error: '{full_command[0]}' command not found. Cannot manage system service.", file=sys.stderr)
        sys.exit(1)
def _build_run_args(args) -> list[str]:
    """Translate parsed CLI options into the argument list of `gpuview run`.

    Options missing from *args* are treated as not given; this happens when
    `gpuview service` is invoked without the `start` sub-command.
    """
    arg_list = ["run"]  # The 'run' command for gpuview
    host = getattr(args, "host", None)
    port = getattr(args, "port", None)
    if host:
        arg_list.append(f"--host {host}")
    if port:
        arg_list.append(f"--port {port}")
    if getattr(args, "safe_zone", False):
        arg_list.append("--safe-zone")
    if getattr(args, "exclude_self", False):
        arg_list.append("--exclude-self")
    return arg_list


def _install(args):
    """Internal function to create, install, and enable the service file.

    Exits with status 1 when the `gpuview` executable is not on PATH.

    Args:
        args: Parsed command-line namespace; `host`, `port`, `safe_zone` and
            `exclude_self` are read when present.
    """
    print(f"Installing {SERVICE_NAME}...")

    gpuview_path = shutil.which("gpuview")
    if not gpuview_path:
        print("Error: 'gpuview' executable not found in PATH.", file=sys.stderr)
        print("Please ensure gpuview is installed and your PATH is correct.", file=sys.stderr)
        sys.exit(1)

    username = getpass.getuser()
    run_command_args = " ".join(_build_run_args(args))
    service_content = textwrap.dedent(f"""
        [Unit]
        Description=GPUView Dashboard Server
        After=network.target

        [Service]
        User={username}
        ExecStart={gpuview_path} {run_command_args}
        Restart=always
        RestartSec=3

        [Install]
        WantedBy=multi-user.target
    """)

    print("--- Service File Content ---")
    print(service_content)
    print("----------------------------")

    temp_path = None
    try:
        # The unit file is staged in a temp file and copied into place with sudo.
        with tempfile.NamedTemporaryFile(mode="w", delete=False) as f:
            f.write(service_content)
            temp_path = f.name

        print("Sudo privileges are required to install the system service.")
        _run_sudo(["cp", temp_path, SERVICE_FILE_PATH])
        _run_sudo(["systemctl", "daemon-reload"])
        _run_sudo(["systemctl", "enable", SERVICE_NAME])

        print(f"Service '{SERVICE_NAME}' has been installed and enabled.")
        print("To check the status, run: gpuview service status")
    finally:
        # Runs on sys.exit() from _run_sudo as well, so the staged file never lingers.
        if temp_path is not None:
            os.remove(temp_path)
def delete(args):
    """Stops and deletes the systemd service.

    Stopping is best-effort: if the stop command fails, deletion continues
    with disabling the unit, removing its file and reloading systemd.

    Args:
        args: Parsed command-line namespace (unused).
    """
    _check_systemd()
    if not _is_installed():
        print(f"Service '{SERVICE_NAME}' is not installed.")
        return

    print(f"Deleting {SERVICE_NAME}...")
    try:
        if _is_active():
            _run_sudo(["systemctl", "stop", SERVICE_NAME])
    except SystemExit:
        # _run_sudo reports failures via sys.exit(), which `except Exception`
        # never catches; intercept it so a failed stop does not abort deletion.
        print("Service was running, continuing with deletion.")

    _run_sudo(["systemctl", "disable", SERVICE_NAME])
    _run_sudo(["rm", SERVICE_FILE_PATH])
    _run_sudo(["systemctl", "daemon-reload"])

    print(f"Service '{SERVICE_NAME}' has been stopped and deleted.")
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gpuview 2 | 3 | [![LICENSE](https://img.shields.io/github/license/fgaim/gpuview.svg)](https://github.com/fgaim/gpuview/blob/master/LICENSE) 4 | ![GitHub issues](https://img.shields.io/github/issues/fgaim/gpuview.svg) 5 | [![Python Versions](https://img.shields.io/pypi/pyversions/gpuview.svg)](https://pypi.org/project/gpuview/) 6 | [![PyPI](https://img.shields.io/pypi/v/gpuview.svg)](https://pypi.org/project/gpuview/) 7 | [![CircleCI](https://circleci.com/gh/fgaim/gpuview.svg?style=shield)](https://circleci.com/gh/fgaim/gpuview) 8 | 9 | GPU is an expensive resource, and deep learning practitioners have to monitor the health and usage of their GPUs, such as the temperature, memory, utilization, and the users. This can be done with tools like `nvidia-smi` and `gpustat` from the terminal or command-line. Often times, however, it is not convenient to `ssh` into servers to just check the GPU status. `gpuview` is meant to mitigate this by running a lightweight web dashboard on top of [`gpustat`][repo_gpustat]. 10 | 11 | With `gpuview` one can monitor GPUs on the go, through a web browser. Moreover, **multiple GPU servers** can be registered into one `gpuview` dashboard and all stats are aggregated and accessible from one place. 12 | 13 | The dashboard features **live auto-refresh** (every 3 seconds) and includes interactive tooltips, temperature-based color coding, and pause/resume controls for real-time GPU monitoring. 14 | 15 | Dashboard view of nine GPUs across multiple servers: 16 | 17 | ![Screenshot of gpuview](imgs/dash-1.png) 18 | 19 | ## Setup 20 | 21 | Python 3.9 or higher is required. 
22 | 23 | Install from [PyPI][pypi_gpuview]: 24 | 25 | ```sh 26 | pip install gpuview 27 | ``` 28 | 29 | [or] Install directly from repo: 30 | 31 | ```sh 32 | pip install git+https://github.com/fgaim/gpuview.git@main 33 | ``` 34 | 35 | > `gpuview` installs the latest version of `gpustat` from `pypi`, so the `gpustat` commands are also available from the terminal. 36 | 37 | ## Usage 38 | 39 | `gpuview` can be used in two modes: as a temporary process or as a background service. 40 | 41 | ### Run gpuview 42 | 43 | Once `gpuview` is installed, it can be started as follows: 44 | 45 | ```sh 46 | gpuview run --safe-zone 47 | ``` 48 | 49 | This will start the dashboard at `http://0.0.0.0:9988`. 50 | 51 | By default, `gpuview` runs at `0.0.0.0` and port `9988`, but these can be changed using `--host` and `--port`. The `--safe-zone` option reports all details, including usernames; omit it to hide them for security reasons. 52 | 53 | For testing and development purposes, you can run gpuview with synthetic data: 54 | 55 | ```sh 56 | gpuview run --demo 57 | ``` 58 | 59 | This displays fake GPU statistics and is useful when developing on systems without NVIDIA GPUs or when showcasing the dashboard. 60 | 61 | ### Run as a Service 62 | 63 | On **Linux systems with systemd** (which is standard on most modern distributions like Ubuntu, RHEL, and Fedora), you can install `gpuview` to run as a permanent background service. This requires `sudo` privileges. 64 | 65 | **1. Install & Start the Service:** 66 | Run the `start` command. The first time you run this, it will also **install** the service. For backward compatibility, `gpuview service` (with no subcommand) defaults to `start`.
67 | 68 | ```sh 69 | # Install and start the service with default settings 70 | gpuview service --safe-zone 71 | 72 | # Or apply custom configurations 73 | gpuview service start [--host <host>] [--port <port>] [--safe-zone] [--exclude-self] 74 | ``` 75 | 76 | The service will be configured with the options you provide (like `--port`) and set to autostart on boot. 77 | 78 | **2. Manage the Service:** 79 | You can easily control the service with these built-in commands: 80 | 81 | * `gpuview service status`: Check if the service is running and see its recent logs. 82 | * `gpuview service logs`: View real-time service logs using journalctl. 83 | * `gpuview service stop`: Stop the background service. 84 | * `gpuview service start`: Start the service if it's been stopped (it will not re-install). 85 | * `gpuview service delete`: Stop, disable, and **uninstall** the service from your system. 86 | 87 | ### Runtime options 88 | 89 | There are a few important options in `gpuview`; use `gpuview --help` to see them all. 90 | 91 | ```sh 92 | gpuview -h 93 | ``` 94 | 95 | * `run` : Start `gpuview` dashboard server 96 | * `--host` : URL or IP address of host (default: 0.0.0.0) 97 | * `--port` : Port number to listen on (default: 9988) 98 | * `--safe-zone` : Safe to report all details, e.g. usernames 99 | * `--exclude-self` : Don't report to others but to self-dashboard 100 | * `--demo` : Run with fake data for testing purposes 101 | * `-d`, `--debug` : Run server in debug mode (for developers) 102 | * `add` : Add a GPU host to the dashboard 103 | * `--url` : URL of host [IP:Port], e.g. X.X.X.X:9988 104 | * `--name` : Optional readable name for the host, e.g. Node101 105 | * `remove` : Remove a registered host from dashboard 106 | * `--url` : URL of host to remove, e.g. X.X.X.X:9988 107 | * `hosts` : Print out all registered hosts 108 | * `service` : Manage the `gpuview` systemd service (Linux only). Defaults to 'start'. 109 | * `start` : Install (if needed) and start the service. 
110 | * `--host` : (Optional) Host to bind (default: 0.0.0.0) 111 | * `--port` : (Optional) Port to bind (default: 9988) 112 | * `--safe-zone` : (Optional) Report all details, e.g. usernames 113 | * `--exclude-self` : (Optional) Don't report to others 114 | * `status` : Check the status of the `gpuview` service. 115 | * `stop` : Stop the `gpuview` service. 116 | * `logs` : View service logs using journalctl. 117 | * `delete` : Stop, disable, and uninstall the service. 118 | * `-v`, `--version` : Print versions of `gpuview` and `gpustat` 119 | * `-h`, `--help` : Print help for command-line options 120 | 121 | ### Monitoring multiple hosts 122 | 123 | To aggregate the stats of multiple machines, they can be registered to one dashboard using their address and the port number running `gpuview`. 124 | 125 | Register a host to monitor as follows: 126 | 127 | ```sh 128 | gpuview add --url <ip:port> --name <name> 129 | ``` 130 | 131 | Remove a registered host as follows: 132 | 133 | ```sh 134 | gpuview remove --url <ip:port> --name <name> 135 | ``` 136 | 137 | Display all registered hosts/nodes as follows: 138 | 139 | ```sh 140 | gpuview hosts 141 | ``` 142 | 143 | The `gpuview` service needs to run on all hosts that will be monitored. 144 | 145 | > Tip: `gpuview` can be set up on a non-GPU machine, such as a laptop, to monitor remote GPU servers. 146 | 147 | ## API Endpoints 148 | 149 | gpuview provides REST API endpoints for programmatic access: 150 | 151 | * `GET /api/gpustat/self` - Returns GPU statistics for the main host 152 | * `GET /api/gpustat/all` - Returns aggregated GPU statistics for all registered hosts 153 | 154 | **Legacy endpoints:** 155 | 156 | * `GET /gpustat` - Returns GPU statistics for the local host (backward compatibility) 157 | 158 | ## Etc 159 | 160 | Helpful tips related to the underlying performance are available at the [`gpustat`][repo_gpustat] repo. 161 | 162 | For the sake of simplicity, `gpuview` does not have user authentication in place. 
As a security measure, 163 | it does not report sensitive details such as user names by default. This can be changed if the service is 164 | running in a trusted network, using the `--safe-zone` option to report all details. 165 | 166 | The `--exclude-self` option of the run command can be used to prevent other dashboards from getting stats of the current machine. This way the stats are shown only on the host's own dashboard. 167 | 168 | Detailed view of GPUs across multiple servers: 169 | 170 | ![Screenshot of gpuview](imgs/dash-2.png) 171 | 172 | ## License 173 | 174 | `gpuview` is licensed under the [MIT License](LICENSE), which is a permissive open-source license that allows you to freely use, modify, and distribute this software. 175 | 176 | [repo_gpustat]: https://github.com/wookayin/gpustat 177 | [pypi_gpuview]: https://pypi.python.org/pypi/gpuview 178 | -------------------------------------------------------------------------------- /gpuview/views/index.tpl: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | gpuview 10 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 28 |
29 |
30 |
31 | % for gpustat in gpustats: 32 | % for gpu in gpustat.get('gpus', []): 33 |
34 |
48 |
49 |
50 |
51 | 52 | {{ gpustat.get('hostname', '-') }} 53 |
54 |
55 |
56 | [{{ gpu.get('index', '') }}] {{ gpu.get('name', '-') }} 57 |
58 |
59 | 79 |
80 |
81 | % end 82 | % end 83 |
84 | 85 |
86 |
87 | All Nodes and GPUs
88 |
89 |
90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | % for gpustat in gpustats: 104 | % for gpu in gpustat.get('gpus', []): 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | % end 115 | % end 116 | 117 |
NodeDeviceTemp.Util.Memory Use/CapPower Use/CapUser Processes
{{ gpustat.get('hostname', '-') }}[{{ gpu.get('index', '') }}] {{ gpu.get('name', '-') }}{{ gpu.get('temperature.gpu', '-') }}°C{{ gpu.get('utilization.gpu', '-') }}%{{ gpu.get('memory', '-') }}% ({{ gpu.get('memory.used', '') }}/{{ gpu.get('memory.total', '-') }}){{ gpu.get('power.draw', '-') }} / {{ gpu.get('enforced.power.limit', '-') }}{{ gpu.get('user_processes', '-') }}
118 |
119 |
120 | 131 |
132 | 133 |
134 |
135 |
136 |
137 |
138 |
GPU Temperature Legend:
139 |
140 | Hot (>75°C) 141 | Warm (50-75°C) 142 | Normal (25-50°C) 143 | Cool (<25°C) 144 |
145 |
146 |
147 |
148 |
149 |
150 | 151 | 158 |
159 | 162 | 165 | 166 | 167 | 324 |
325 | 326 | 327 | 328 | --------------------------------------------------------------------------------