├── ksmi ├── __init__.py ├── add_env.py ├── __main__.py ├── copy_id.py └── kairos_smi.py ├── requirements.txt ├── img └── sample.png ├── change_logs ├── v0.2.0.md ├── v0.1.3.1.md ├── v0.1.4.md └── v0.1.5.md ├── config.example.json ├── .github └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── setup.py ├── LICENSE ├── tests ├── test_ssh_remote_command.py └── test_get_gpu_status.py ├── .circleci └── config.yml ├── .gitignore └── README.md /ksmi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ksmi/add_env.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | unittest-xml-reporting==2.5.1 -------------------------------------------------------------------------------- /ksmi/__main__.py: -------------------------------------------------------------------------------- 1 | from . 
import kairos_smi 2 | 3 | kairos_smi.main() -------------------------------------------------------------------------------- /img/sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kairos03/kairos-smi/HEAD/img/sample.png -------------------------------------------------------------------------------- /change_logs/v0.2.0.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | version : v0.2.0 3 | 4 | ## change thing 5 | Bug fix ksmi 6 | -------------------------------------------------------------------------------- /config.example.json: -------------------------------------------------------------------------------- 1 | { 2 | "hosts": [ 3 | "@[:port]", 4 | "kairos@123.123.123.123:22" 5 | ] 6 | } -------------------------------------------------------------------------------- /change_logs/v0.1.3.1.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | version : v0.1.3.1 3 | 4 | ## change thing 5 | hot fix `key error occured` when ssh command is not successfully finished. -------------------------------------------------------------------------------- /change_logs/v0.1.4.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | version : v0.1.4 3 | 4 | ## change thing 5 | Completely fixed `key error occured` when ssh command is not successfully finished. 
6 | Change space between displayed text -------------------------------------------------------------------------------- /change_logs/v0.1.5.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | version : v0.1.5 3 | 4 | ## change thing 5 | To performance enhancement totally replace threading to multiprocess 6 | Divide display function from main 7 | Change context structure 8 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
"""Packaging script for ksmi, a multi-server GPU monitoring tool."""
from setuptools import setup, find_packages

# Read the long description with an explicit encoding and close the file
# deterministically; a bare ``open(...).read()`` leaks the handle and decodes
# with whatever the platform default encoding happens to be.
with open('README.md', encoding='utf-8') as readme_file:
    long_description = readme_file.read()

setup(
    name='ksmi',
    version='0.2.1',
    url='https://github.com/kairos03/kairos-smi',
    license='MIT',
    author='Eunseop Shin',
    author_email='kairos9603@gmail.com',
    description='Multi-server GPU monitoring tools',
    packages=find_packages(exclude=['tests', 'config.json']),
    long_description=long_description,
    long_description_content_type="text/markdown",
    zip_safe=False,
    setup_requires=['nose>=1.0'],
    test_suite='nose.collector',
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 EunSeop Shin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
class test_ssh_remote_command(unittest.TestCase):
    """Integration tests for ``ssh_remote_command``.

    Every test opens a real SSH connection to the hosts set in ``setUp``,
    so they only pass where those hosts and a matching key are reachable.
    """

    def setUp(self):
        self.host = "mlvc07@163.180.186.49:2222"
        self.wrong_host = "test@123.123.123.123:2211"
        self.timeout = 10

    def test_echo_success(self):
        """Two echoed lines come back as two single-field rows."""
        command = 'echo hello; echo hi'
        expected = {
            'status': 'Success',
            'entry': self.host,
            'command': command,
            'data': [['hello'], ['hi']],
        }
        actual = ssh_remote_command(self.host, command, self.timeout)
        self.assertEqual(actual, expected)

    def test_query_success(self):
        """The GPU query succeeds on the reachable host."""
        outcome = ssh_remote_command(self.host, QUERY_GPU, self.timeout)
        self.assertEqual(outcome['status'], 'Success')

    def test_query_fail(self):
        """An unreachable host is reported as a timeout."""
        outcome = ssh_remote_command(self.wrong_host, 'echo hello', self.timeout)
        self.assertEqual(outcome['status'], 'Timeout')
"""Copy the local SSH public key to remote servers and verify the result.

Usage: ``python3 -m ksmi.copy_id [-n] [-c CONFIG] [-s SERVER]``
"""
import os
import sys
import json
import subprocess
import argparse


def parse_host(host):
    """Split a ``[user@]host[:port]`` string into ``(endpoint, port)``.

    Both values are returned as strings because they are passed straight
    into subprocess argument lists. When *host* does not contain exactly
    one ``:`` the whole string is used as the endpoint and the port
    defaults to ``'22'`` (the same rule ``kairos_smi`` applies).
    """
    try:
        endpoint, port = host.split(':')
    except ValueError:
        # Unpacking a wrong number of parts raises ValueError, not KeyError.
        endpoint, port = host, '22'
    return endpoint, port


def is_key_authorized(public_key, authorized_keys):
    """Return True when *public_key* is listed in *authorized_keys*.

    Args:
        public_key: Text of a public key file (``<type> <base64> [comment]``).
        authorized_keys: Text of a remote ``authorized_keys`` file.

    Only the base64 key body is compared, so differing comments do not
    matter. Blank lines, comment lines and entries with an options prefix
    are handled without raising.
    """
    fields = public_key.split()
    if len(fields) < 2:
        return False
    key_body = fields[1]
    return any(key_body in line.split()
               for line in authorized_keys.splitlines())


def copy_id_to_host(host):
    """Run ``ssh-copy-id`` for *host* and report whether the key is installed.

    Returns:
        True if the local ``~/.ssh/id_rsa.pub`` key is present in the remote
        ``~/.ssh/authorized_keys`` afterwards, False otherwise.

    Raises:
        RuntimeError: The verification ssh command produced no output.
    """
    endpoint, port = parse_host(host)

    # Argument lists avoid handing config-supplied text to a shell.
    subprocess.call(['ssh-copy-id', '-p', port, endpoint])

    ssh = subprocess.Popen(
        ['ssh', '-p', port, endpoint, 'cat ~/.ssh/authorized_keys'],
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    # communicate() drains both pipes together, so a chatty stderr cannot
    # block the child while stdout is still being read.
    out, err = ssh.communicate()
    if not out:
        error = err.decode('utf-8', errors='replace').strip()
        raise RuntimeError('SSH connection refused. {}'.format(error))

    key_path = os.path.join(os.path.expanduser('~'), '.ssh', 'id_rsa.pub')
    with open(key_path, 'r') as key_file:
        my_key = key_file.read()

    return is_key_authorized(my_key, out.decode('utf-8', errors='replace'))


def main(argv=None):
    """Parse the command line, then copy and verify the key on every host."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--new_id', action='store_true', help='generate new id_rsa key')
    parser.add_argument('-c', '--config', default=None, help='set config file to use host list')
    parser.add_argument('-s', '--server', default=None, help='set a server to copy id')
    args = parser.parse_args(argv)

    # generate new rsa_id key
    if args.new_id:
        subprocess.call(['ssh-keygen'])

    # collect hosts from the config file and/or the single --server option
    hosts = []
    if args.config is not None:
        with open(args.config, 'r') as f:
            conf = json.load(f)
        hosts.extend(conf['hosts'])

    if args.server is not None:
        hosts.append(args.server)

    if not hosts:
        print("NO HOST TO COPY ID")
        sys.exit(-1)

    for host in hosts:
        if copy_id_to_host(host):
            print("[OK] {}".format(host))
        else:
            print("[Fail] {}".format(host))


if __name__ == '__main__':
    main()
class test_get_gpu_status(unittest.TestCase):
    """Integration tests for ``get_gpus_status``.

    Both tests open real SSH connections to the hosts set in ``setUp``.
    """

    def setUp(self):
        self.hosts = ["mlvc07@163.180.186.49:2222"]
        self.wrong_hosts = ["test@123.123.123.123:2211"]
        self.timeout = 10

    def _assert_result_shape(self, results, hosts):
        """Check the per-host structure shared by the success and fail cases."""
        self.assertIsInstance(results, dict)
        self.assertEqual(len(results), 1)
        self.assertIn(hosts[0], results)
        for status in results.values():
            self.assertIsInstance(status, dict)
            self.assertEqual(len(status), 2)
            self.assertIn('gpus', status)
            self.assertIn('apps', status)

    def test_get_gpu_status_success(self):
        """A reachable host reports non-empty GPU and application data."""
        results = get_gpus_status(self.hosts, self.timeout)
        self._assert_result_shape(results, self.hosts)
        for status in results.values():
            self.assertNotEqual(len(status['gpus']), 0)
            self.assertNotEqual(len(status['apps']), 0)

    def test_get_gpu_status_fail(self):
        """An unreachable host still gets an entry, with empty data."""
        results = get_gpus_status(self.wrong_hosts, self.timeout)
        self._assert_result_shape(results, self.wrong_hosts)
        for status in results.values():
            self.assertEqual(len(status['gpus']), 0)
            self.assertEqual(len(status['apps']), 0)
Download and cache dependencies 32 | - restore_cache: 33 | keys: 34 | - v1-dependencies-{{ checksum "requirements.txt" }} 35 | # fallback to using the latest cache if no exact match is found 36 | - v1-dependencies- 37 | - run: 38 | name: install dependencies 39 | command: | 40 | python3 -m venv venv 41 | . venv/bin/activate 42 | pip install -r requirements.txt 43 | 44 | - run: 45 | name: install test requirements 46 | command: | 47 | . venv/bin/activate 48 | pip install nose 49 | 50 | - save_cache: 51 | paths: 52 | - ./venv 53 | key: v1-dependencies-{{ checksum "requirements.txt" }} 54 | # run tests 55 | - run: 56 | name: run tests 57 | command: | 58 | . venv/bin/activate 59 | nosetests -v tests/* 60 | - store_artifacts: 61 | path: test-reports 62 | destination: test-reports 63 | - store_test_results: 64 | path: test-results 65 | 66 | test-3.4: 67 | <<: *test-template 68 | docker: 69 | - image: circleci/python:3.4 70 | 71 | test-3.6: 72 | <<: *test-template 73 | docker: 74 | - image: circleci/python:3.6 75 | 76 | test-3.7: 77 | <<: *test-template 78 | docker: 79 | - image: circleci/python:3.7 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # pipenv 86 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 87 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 88 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 89 | # install all needed dependencies. 
90 | #Pipfile.lock 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # Environments 99 | .env 100 | .venv 101 | env/ 102 | venv/ 103 | ENV/ 104 | env.bak/ 105 | venv.bak/ 106 | 107 | # Spyder project settings 108 | .spyderproject 109 | .spyproject 110 | 111 | # Rope project settings 112 | .ropeproject 113 | 114 | # mkdocs documentation 115 | /site 116 | 117 | # mypy 118 | .mypy_cache/ 119 | .dmypy.json 120 | dmypy.json 121 | 122 | # Pyre type checker 123 | .pyre/ 124 | 125 | **/config.json 126 | .vscode/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # kairos-smi 2 | Multi-server gpu monitoring program 3 | 4 | [![PyPI version](https://badge.fury.io/py/ksmi.svg)](https://badge.fury.io/py/ksmi) [![CircleCI](https://circleci.com/gh/kairos03/kairos-smi.svg?style=svg)](https://circleci.com/gh/kairos03/kairos-smi) 5 | 6 | See change Log: [change log](change_logs/v0.1.3.1.md) 7 | 8 | ![sample.png](img/sample.png) 9 | 10 | ``` 11 | usage: ksmi [-h] [-l] [-c CONFIG] 12 | 13 | optional arguments: 14 | -h, --help show this help message and exit 15 | -l, --loop loop forever 16 | -c CONFIG, --config CONFIG 17 | set config file location 18 | ``` 19 | 20 | # quick start 21 | ## 1. install 22 | Install with pip 23 | ```shell 24 | $ pip install ksmi 25 | or 26 | $ pip3 install ksmi 27 | ``` 28 | 29 | ## 2. Setup config file 30 | Edit `config.json`. Add your gpu server address in `config.json`. 31 | ```json 32 | { 33 | "hosts": [ 34 | "<username>@<host>[:port]", 35 | "<username>@<host>[:port]" 36 | ] 37 | } 38 | ``` 39 | 40 | ## 3. Add rsa_id to server 41 | 42 | Create a new rsa_id key and add it to your server. 43 | 44 | ```shell 45 | $ python3 -m ksmi.copy_id -c config.json -n 46 | ``` 47 | 48 | ## 4. Run It! 
49 | ```shell 50 | $ python3 -m ksmi -c config.json -l 51 | ``` 52 | 53 | # kairos-smi 54 | ``` 55 | usage: kairos_smi.py [-h] [-l] [-c CONFIG] 56 | 57 | optional arguments: 58 | -h, --help show this help message and exit 59 | -l, --loop loop forever 60 | -c CONFIG, --config CONFIG 61 | set config file location 62 | ``` 63 | 64 | ### Simple usage 65 | Run once 66 | ``` 67 | $ python3 -m ksmi -c config.json 68 | ``` 69 | 70 | Run forever 71 | ``` 72 | $ python3 -m ksmi -c config.json -l 73 | ``` 74 | 75 | # utils 76 | ## copy_id 77 | ``` 78 | usage: copy_id.py [-h] [-n] [-c CONFIG] [-s SERVER] 79 | 80 | optional arguments: 81 | -h, --help show this help message and exit 82 | -n, --new_id generate new id_rsa key 83 | -c CONFIG, --config CONFIG 84 | set config file to use host list 85 | -s SERVER, --server SERVER 86 | set a server to copy id 87 | ``` 88 | 89 | ## copy id(rsa) to server 90 | 91 | ### Use with `config.json` 92 | ``` 93 | $ python3 -m ksmi.copy_id -c config.json 94 | or 95 | $ python3 -m ksmi.copy_id --config config.json 96 | ``` 97 | 98 | ### Use with individual Server address 99 | ``` 100 | $ python3 -m ksmi.copy_id -s [username@][:port] 101 | or 102 | $ python3 -m ksmi.copy_id --server [username@][:port] 103 | ``` 104 | 105 | ### Make a new id_rsa 106 | ``` 107 | $ python3 -m ksmi.copy_id -n 108 | or 109 | $ python3 -m ksmi.copy_id --new_id 110 | ``` 111 | -------------------------------------------------------------------------------- /ksmi/kairos_smi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | import json 5 | from multiprocessing import Process, Queue 6 | import argparse 7 | import logging 8 | 9 | logging.basicConfig(level=logging.ERROR) 10 | 11 | # querys 12 | QUERY_GPU = "nvidia-smi --query-gpu=timestamp,gpu_uuid,count,name,pstate,temperature.gpu,utilization.gpu,memory.used,memory.total --format=csv,noheader" 13 | QUERY_APP = "nvidia-smi 
def ssh_remote_command(entrypoint, command, timeout=1):
    """Run *command* on a remote host through ``ssh`` and return a result dict.

    Args:
        entrypoint: Target in ``[user@]host[:port]`` form. When it does not
            contain exactly one ``:`` the whole string is used as the host
            and port ``'22'`` is assumed.
        command: Command string handed to ``ssh`` for remote execution.
        timeout: Seconds to wait for ``ssh`` before it is killed.

    Returns:
        A dict with the keys ``'status'`` (``'Success'``, ``'Error'`` or
        ``'Timeout'``), ``'entry'`` (the *entrypoint* as given),
        ``'command'`` and ``'data'``. ``'data'`` is a list of output lines,
        each split on ``', '``; it comes from stdout on success and from
        stderr otherwise.
    """

    def postprocessing(data):
        # splitlines() keeps a final line that has no trailing newline, and
        # errors='replace' keeps one undecodable byte from raising.
        text = data.decode('utf-8', errors='replace')
        return [line.split(', ') for line in text.splitlines()]

    try:
        host, port = entrypoint.split(':')
    except ValueError:
        host, port = entrypoint, '22'

    ssh = subprocess.Popen(['ssh', host, '-p', port, command],
                           shell=False,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
    try:
        out, err = ssh.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        # Kill the child and reap it so no zombie or open pipe is left.
        ssh.kill()
        out, err = ssh.communicate()
        return {'status': 'Timeout', 'entry': entrypoint, 'command': command, 'data': postprocessing(err)}

    # Any stderr output is treated as a failure of the remote command.
    if err != b'':
        return {'status': 'Error', 'entry': entrypoint, 'command': command, 'data': postprocessing(err)}
    return {'status': 'Success', 'entry': entrypoint, 'command': command, 'data': postprocessing(out)}


def _run_command_and_enqueue(q, host, query, timeout):
    """Worker entry point: run one query on one host and queue the result.

    Defined at module level so it can be pickled when the start method is
    ``spawn``. Exactly one item is always queued, even if the command could
    not be started, because the parent waits for one result per worker.
    """
    try:
        result = ssh_remote_command(host, query, timeout=timeout)
    except Exception as exc:  # process boundary: a lost result would hang the parent
        result = {'status': 'Error', 'entry': host, 'command': query, 'data': [[str(exc)]]}
    q.put(result)


def get_gpus_status(hosts, timeout=1):
    """Query GPU and compute-app status of every host in parallel.

    One process is started per (host, query) pair, for the two queries
    ``QUERY_GPU`` and ``QUERY_APP``.

    Args:
        hosts: Iterable of ``[user@]host[:port]`` strings.
        timeout: Per-command timeout in seconds, passed to
            ``ssh_remote_command``.

    Returns:
        A dict mapping each host to ``{'gpus': ..., 'apps': ...}``. Each
        value is the parsed row list when the query succeeded and an empty
        dict when it failed or timed out. An empty *hosts* yields ``{}``.
    """
    hosts = list(hosts)
    result = {}
    if not hosts:
        return result

    que = Queue()
    procs = []
    for host in hosts:
        for query in (QUERY_GPU, QUERY_APP):
            proc = Process(target=_run_command_and_enqueue,
                           args=(que, host, query, timeout))
            proc.start()
            procs.append(proc)

    # Collect exactly one result per worker BEFORE joining: a process that
    # has put data on a queue may not exit until that data is consumed, and
    # Queue.empty() is not a reliable end-of-results signal.
    for _ in procs:
        item = que.get()
        entry = item.get('entry')
        item_type = 'apps' if item.get('command') == QUERY_APP else 'gpus'

        # failed queries are recorded as an empty mapping
        data = item.get('data') if item['status'] == 'Success' else {}
        result.setdefault(entry, {})[item_type] = data

    for proc in procs:
        proc.join()

    que.close()

    return result
83 | 84 | def display_gpu_status(hosts, data): 85 | """Display gpu status 86 | """ 87 | for host in hosts: 88 | gpu_stat = data[host].get('gpus') 89 | app_stat = data[host].get('apps') 90 | 91 | # print gpu stat 92 | # if gpu stat is empty 93 | print('[{:.30}]'.format(host), end='') 94 | if gpu_stat == None or app_stat == None or len(gpu_stat) == 0: 95 | print('\n|{}|'.format(' ERROR '), end='\n') 96 | continue 97 | else: 98 | print('{:>26}'.format("Running [{:2}/{:2}]".format(len(app_stat), len(gpu_stat))), end='\n') 99 | 100 | # print apps 101 | for i, gpu in enumerate(gpu_stat): 102 | if len(gpu) != 9: 103 | continue 104 | print("| {} | Temp {:2s}C | Util {:>5s} | Mem {:>6s} / {:9s} |".format(i, gpu[5], gpu[6], gpu[7][:-4], gpu[8])) 105 | 106 | 107 | def get_args(): 108 | parser = argparse.ArgumentParser() 109 | parser.add_argument('-l', '--loop', action='store_true', help='loop forever') 110 | parser.add_argument('-c', '--config', default='config.json', help='set config file location') 111 | args = parser.parse_args() 112 | return args 113 | 114 | def main(): 115 | args = get_args() 116 | 117 | try: 118 | with open(args.config, 'r') as f: 119 | conf = json.load(f) 120 | except FileNotFoundError: 121 | print("[ERROR] Config file '{}' not found.".format(args.config)) 122 | exit() 123 | 124 | HOSTS = conf['hosts'] 125 | 126 | while(True): 127 | result = get_gpus_status(HOSTS) 128 | 129 | if args.loop: 130 | os.system('cls' if os.name == 'nt' else "printf '\033c'") 131 | 132 | logging.debug("result {}".format(result)) 133 | display_gpu_status(HOSTS, result) 134 | 135 | if not args.loop: 136 | 137 | break 138 | 139 | 140 | if __name__ == '__main__': 141 | main() 142 | --------------------------------------------------------------------------------