├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── example ├── multi_card_mnist.py ├── single_card_mnist.py └── wait.py ├── finished_queue.png ├── homepage.png ├── requirements.txt ├── setup.py ├── tests ├── __init__.py └── test_client.py ├── watchmen ├── __init__.py ├── client.py ├── listener.py ├── reminder.py ├── server.py ├── templates │ ├── index.html │ └── old_index.html └── wait.py └── working_queue.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # self-defined 132 | .vscode/ 133 | data/ 134 | .watchmen_server.pid 135 | .watchmen_server.token 136 | .watchmen_client.token 137 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Spico 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: clean 2 | python3 setup.py sdist bdist_wheel 3 | 4 | test_upload: 5 | python3 -m twine upload --verbose --repository testpypi dist/* 6 | 7 | upload: 8 | python3 -m twine upload --repository pypi dist/* 9 | 10 | clean: 11 | rm -rf build/ 12 | rm -rf dist/ 13 | rm -rf *.egg-info/ 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Watchmen 2 | A simple and easy-to-use toolkit for GPU scheduling. 3 | 4 | ## Dependencies 5 | - [Python](https://www.python.org/downloads/) >= 3.6 6 | - requests >= 2.24.0 7 | - pydantic >= 1.7.1 8 | - gpustat >= 0.6.0 9 | - flask >= 1.1.2 10 | - apscheduler >= 3.6.3 11 | 12 | ## Installation 13 | 14 | 1. Install dependencies. 15 | ```bash 16 | $ pip install -r requirements.txt 17 | ``` 18 | 19 | 2. Install watchmen. 20 | 21 | Install from source code: 22 | ```bash 23 | $ pip install -e . 24 | ``` 25 | 26 | Or you can install the stable version package from pypi. 27 | ```bash 28 | $ pip install gpu-watchmen -i https://pypi.org/simple 29 | ``` 30 | 31 | ## Quick Start 32 | 1. 
Start the server 33 | 34 | The default port of the server is `62333` 35 | ```bash 36 | $ python -m watchmen.server 37 | ``` 38 | 39 | If you want the server to be running backend, try: 40 | ```bash 41 | $ nohup python -m watchmen.server 1>watchmen.log 2>&1 & 42 | ``` 43 | 44 | There are some configurations for the server 45 | ``` 46 | usage: server.py [-h] [--host HOST] [--port PORT] 47 | [--queue_timeout QUEUE_TIMEOUT] 48 | [--request_interval REQUEST_INTERVAL] 49 | [--status_queue_keep_time STATUS_QUEUE_KEEP_TIME] 50 | 51 | optional arguments: 52 | -h, --help show this help message and exit 53 | --host HOST host address for api server 54 | --port PORT port for api server 55 | --queue_timeout QUEUE_TIMEOUT 56 | timeout for queue waiting (seconds) 57 | --request_interval REQUEST_INTERVAL 58 | interval for gpu status requesting (seconds) 59 | --status_queue_keep_time STATUS_QUEUE_KEEP_TIME 60 | hours for keeping the client status. set `-1` to keep all clients' status 61 | ``` 62 | 63 | 2. Modify the source code in your project: 64 | 65 | ```python 66 | from watchmen import WatchClient 67 | 68 | client = WatchClient(id="short description of this running", gpus=[1], 69 | server_host="127.0.0.1", server_port=62333) 70 | client.wait() 71 | ``` 72 | 73 | When the program goes on after `client.wait()`, you are in the working queue. 74 | Watchmen supports two requesting mode: 75 | - `queue` mode means you are waiting for the gpus in `gpus` arguments. 76 | - `schedule` mode means you are waiting for the server to spare `req_gpu_num` of available GPUs in `gpus`. 77 | You can check examples in `example/` for further reading. 78 | 79 | ```bash 80 | # single card queue mode 81 | $ cd example && python single_card_mnist.py --id="single" --cuda=0 --wait 82 | # single card schedule mode 83 | $ cd example && python single_card_mnist.py --id="single schedule" --cuda=0,2,3 --req_gpu_num=1 --wait_mode="schedule" --wait 84 | # queue mode 85 | $ cd example && python multi_card_mnist.py --id="multi" --cuda=2,3 --wait 86 | # schedule mode 87 | $ cd example && python multi_card_mnist.py --id='multi card scheduling wait' --cuda=1,0,3 --req_gpu_num=2 --wait="schedule" 88 | ``` 89 | 90 | 3. Check the queue in browser. 91 | 92 | Open the following link to your browser: `http://:`, for example: `http://192.168.126.143:62333`. 93 | 94 | And you can get a result like the demo below. 95 | Please be aware that the page is not going to change dynamically, so you can refresh the page manually to check the latest status. 96 | 97 | Home page: GPU status 98 | 99 | ![HomePage](homepage.png) 100 | 101 | Working queue: 102 | ![WorkingQueue](working_queue.png) 103 | 104 | Finished queue: 105 | ![FinishedQueue](finished_queue.png) 106 | 107 | 108 | 1. Reminder when program is finished. 109 | 110 | `watchmen` also support email and other kinds of reminders for message informing. 111 | For example, you can send yourself an email when the program is finished. 112 | 113 | ```python 114 | from watchmen.reminder import send_email 115 | 116 | ... # your code here 117 | 118 | send_email( 119 | host="smtp.163.com", # email host to login, like `smtp.163.com` 120 | port=25, # email port to login, like `25` 121 | user="***@163.com", # user email address for login, like `***@163.com` 122 | password="***", # password or auth code for login 123 | receiver="***@outlook.com", # receiver email address 124 | html_message="
<p>Your program is finished!</p>
", # content, html format supported 125 | subject="Proram Finished Notice" # email subject 126 | ) 127 | ``` 128 | 129 | To get more reminders, please check `watchmen/reminder.py`. 130 | 131 | ## UPDATE 132 | - v0.4.0: add token authentication 133 | - v0.3.9: add `cancel` api and button in the working queue, fix json encoding bug with higher versions of flask 134 | - v0.3.8: change `OK` status to be shown only in the finished queue, and show `ready` in the working queue. Fix severe bug when scheduling 135 | - v0.3.7: much faster due to lock free changes! fix timeout and schedule bug 136 | - v0.3.6: fix front-end api hostname bug 137 | - v0.3.5: fix front-end api port bug 138 | - v0.3.4: refreshed interface, add `register_time` field, fix `check_finished` bug 139 | - v0.3.3: fix `check_finished` bug in server end, quit the main thread if the sub-thread is quit, and remove the backend cmd in the main thread 140 | - v0.3.2: fix `WatchClient` bug 141 | - v0.3.1: change `Client` into `WatchClient`, fix `ClientCollection` and `send_email` bug 142 | - v0.3.0: support gpu scheduling, fix blank input output, fix `check_gpus_existence` 143 | - v0.2.2: fix html package data, add multi-card example 144 | 145 | ## TODO 146 | - [ ] import user authentication modules to help the working queue delete operations 147 | - [ ] read programs' pids to help reading program working status and kill tasks remotely 148 | - [ ] test and support distributed model parallel configurations (with `python -m torch.distributed.launch`) 149 | - [ ] prettify the web page and divide functions into different tabs 150 | - [ ] gpu using stats for each user and process 151 | - [x] quit the main thread if the sub-thread is quit 152 | - [x] change `Client` into `WatchClient`, in case of any ambiguity 153 | - [x] `ClientCollection/__contains__` function should not include `finished_queue`, to help the `id` releases 154 | - [x] subject bug in `reminder/send_email()` 155 | - [x] add schedule feature, so clients only have to request for a number and range of gpus, and the server will assign the gpu num to clients 156 | - [x] add reminders 157 | - [x] add webui html support 158 | - [x] add examples 159 | -------------------------------------------------------------------------------- /example/multi_card_mnist.py: -------------------------------------------------------------------------------- 1 | """ 2 | modified from: 3 | https://github.com/pytorch/examples/blob/master/mnist/main.py 4 | """ 5 | from __future__ import print_function 6 | import argparse 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import torch.optim as optim 11 | from torchvision import datasets, transforms 12 | from torch.optim.lr_scheduler import StepLR 13 | 14 | from watchmen import WatchClient, ClientMode 15 | 16 | 17 | class Net(nn.Module): 18 | def __init__(self): 19 | super(Net, self).__init__() 20 | self.conv1 = nn.Conv2d(1, 32, 3, 1) 21 | self.conv2 = nn.Conv2d(32, 64, 3, 1) 22 | self.dropout1 = nn.Dropout(0.25) 23 | self.dropout2 = nn.Dropout(0.5) 24 | self.fc1 = nn.Linear(9216, 128) 25 | self.fc2 = nn.Linear(128, 10) 26 | 27 | def forward(self, x): 28 | x = self.conv1(x) 29 | x = F.relu(x) 30 | x = self.conv2(x) 31 | x = F.relu(x) 32 | x = F.max_pool2d(x, 2) 33 | x = self.dropout1(x) 34 | x = torch.flatten(x, 1) 35 | x = self.fc1(x) 36 | x = F.relu(x) 37 | x = self.dropout2(x) 38 | x = self.fc2(x) 39 | output = F.log_softmax(x, dim=1) 40 | return output 41 | 42 | 43 | def train(args, model, device, train_loader, optimizer, 
epoch): 44 | model.train() 45 | for batch_idx, (data, target) in enumerate(train_loader): 46 | data, target = data.to(device), target.to(device) 47 | optimizer.zero_grad() 48 | output = model(data) 49 | loss = F.nll_loss(output, target) 50 | loss.backward() 51 | optimizer.step() 52 | if batch_idx % args.log_interval == 0: 53 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 54 | epoch, batch_idx * len(data), len(train_loader.dataset), 55 | 100. * batch_idx / len(train_loader), loss.item())) 56 | if args.dry_run: 57 | break 58 | 59 | 60 | def test(model, device, test_loader): 61 | model.eval() 62 | test_loss = 0 63 | correct = 0 64 | with torch.no_grad(): 65 | for data, target in test_loader: 66 | data, target = data.to(device), target.to(device) 67 | output = model(data) 68 | test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss 69 | pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability 70 | correct += pred.eq(target.view_as(pred)).sum().item() 71 | 72 | test_loss /= len(test_loader.dataset) 73 | 74 | print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( 75 | test_loss, correct, len(test_loader.dataset), 76 | 100. * correct / len(test_loader.dataset))) 77 | 78 | 79 | def main(): 80 | # Training settings 81 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 82 | parser.add_argument('--batch-size', type=int, default=64, metavar='N', 83 | help='input batch size for training (default: 64)') 84 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 85 | help='input batch size for testing (default: 1000)') 86 | parser.add_argument('--epochs', type=int, default=50, metavar='N', 87 | help='number of epochs to train (default: 14)') 88 | parser.add_argument('--lr', type=float, default=1.0, metavar='LR', 89 | help='learning rate (default: 1.0)') 90 | parser.add_argument('--gamma', type=float, default=0.7, metavar='M', 91 | help='Learning rate step gamma (default: 0.7)') 92 | parser.add_argument('--dry-run', action='store_true', default=False, 93 | help='quickly check a single pass') 94 | parser.add_argument('--seed', type=int, default=1, metavar='S', 95 | help='random seed (default: 1)') 96 | parser.add_argument('--log-interval', type=int, default=10, metavar='N', 97 | help='how many batches to wait before logging training status') 98 | parser.add_argument('--save-model', action='store_true', default=False, 99 | help='For Saving the current Model') 100 | parser.add_argument("--id", type=str, default="id", 101 | help="identifier") 102 | parser.add_argument("--cuda", type=str, 103 | help="cuda device") 104 | parser.add_argument("--req_gpu_num", type=int, default=0, 105 | help="request gpu number if is `schedule` mode") 106 | parser.add_argument("--wait", type=str, default="queue", 107 | choices=["queue", "schedule", "none"], 108 | help="wait for watchmen signal") 109 | args = parser.parse_args() 110 | torch.manual_seed(args.seed) 111 | 112 | cudas = sorted(list(map(int, args.cuda.split(',')))) 113 | device = torch.device(f"cuda:{cudas[0]}") 114 | 115 | """WATCHMEN""" 116 | if args.wait == "queue": 117 | # queue wait 118 | client = WatchClient(id=f"mnist multi card {args.id} cuda={args.cuda}", gpus=cudas, 119 | server_host="127.0.0.1", server_port=62333) 120 | elif args.wait == "schedule": 121 | # scheduling wait 122 | client = WatchClient(id=f"mnist multi card {args.id} cuda={args.cuda}", 123 | gpus=cudas, mode=ClientMode.SCHEDULE, 
req_gpu_num=args.req_gpu_num, 124 | server_host="127.0.0.1", server_port=62334) 125 | device_ids = client.wait() 126 | """END OF WATCHMEN""" 127 | 128 | train_kwargs = {'batch_size': args.batch_size} 129 | test_kwargs = {'batch_size': args.test_batch_size} 130 | 131 | cuda_kwargs = {'num_workers': 1, 132 | 'pin_memory': True, 133 | 'shuffle': True} 134 | train_kwargs.update(cuda_kwargs) 135 | test_kwargs.update(cuda_kwargs) 136 | 137 | transform=transforms.Compose([ 138 | transforms.ToTensor(), 139 | transforms.Normalize((0.1307,), (0.3081,)) 140 | ]) 141 | 142 | dataset1 = datasets.MNIST('../data', train=True, download=True, 143 | transform=transform) 144 | dataset2 = datasets.MNIST('../data', train=False, 145 | transform=transform) 146 | train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) 147 | test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) 148 | 149 | model = Net() 150 | model = nn.DataParallel(model, device_ids=device_ids) 151 | model.to(device) 152 | optimizer = optim.Adadelta(model.parameters(), lr=args.lr) 153 | 154 | scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) 155 | for epoch in range(1, args.epochs + 1): 156 | train(args, model, device, train_loader, optimizer, epoch) 157 | test(model, device, test_loader) 158 | scheduler.step() 159 | 160 | if args.save_model: 161 | torch.save(model.module.state_dict(), "mnist_cnn.pt") 162 | 163 | 164 | if __name__ == '__main__': 165 | main() 166 | -------------------------------------------------------------------------------- /example/single_card_mnist.py: -------------------------------------------------------------------------------- 1 | """ 2 | modified from: 3 | https://github.com/pytorch/examples/blob/master/mnist/main.py 4 | """ 5 | from __future__ import print_function 6 | 7 | import sys 8 | import argparse 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | from torchvision import datasets, transforms 15 | from torch.optim.lr_scheduler import StepLR 16 | 17 | from watchmen import WatchClient 18 | from watchmen.client import ClientMode 19 | 20 | 21 | class Net(nn.Module): 22 | def __init__(self): 23 | super(Net, self).__init__() 24 | self.conv1 = nn.Conv2d(1, 32, 3, 1) 25 | self.conv2 = nn.Conv2d(32, 64, 3, 1) 26 | self.dropout1 = nn.Dropout(0.25) 27 | self.dropout2 = nn.Dropout(0.5) 28 | self.fc1 = nn.Linear(9216, 128) 29 | self.fc2 = nn.Linear(128, 10) 30 | 31 | def forward(self, x): 32 | x = self.conv1(x) 33 | x = F.relu(x) 34 | x = self.conv2(x) 35 | x = F.relu(x) 36 | x = F.max_pool2d(x, 2) 37 | x = self.dropout1(x) 38 | x = torch.flatten(x, 1) 39 | x = self.fc1(x) 40 | x = F.relu(x) 41 | x = self.dropout2(x) 42 | x = self.fc2(x) 43 | output = F.log_softmax(x, dim=1) 44 | return output 45 | 46 | 47 | def train(args, model, device, train_loader, optimizer, epoch): 48 | model.train() 49 | for batch_idx, (data, target) in enumerate(train_loader): 50 | data, target = data.to(device), target.to(device) 51 | optimizer.zero_grad() 52 | output = model(data) 53 | loss = F.nll_loss(output, target) 54 | loss.backward() 55 | optimizer.step() 56 | if batch_idx % args.log_interval == 0: 57 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 58 | epoch, batch_idx * len(data), len(train_loader.dataset), 59 | 100. 
* batch_idx / len(train_loader), loss.item())) 60 | if args.dry_run: 61 | break 62 | 63 | 64 | def test(model, device, test_loader): 65 | model.eval() 66 | test_loss = 0 67 | correct = 0 68 | with torch.no_grad(): 69 | for data, target in test_loader: 70 | data, target = data.to(device), target.to(device) 71 | output = model(data) 72 | test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss 73 | pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability 74 | correct += pred.eq(target.view_as(pred)).sum().item() 75 | 76 | test_loss /= len(test_loader.dataset) 77 | 78 | print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( 79 | test_loss, correct, len(test_loader.dataset), 80 | 100. * correct / len(test_loader.dataset))) 81 | 82 | 83 | def main(): 84 | # Training settings 85 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 86 | parser.add_argument('--batch-size', type=int, default=64, metavar='N', 87 | help='input batch size for training (default: 64)') 88 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 89 | help='input batch size for testing (default: 1000)') 90 | parser.add_argument('--epochs', type=int, default=50, metavar='N', 91 | help='number of epochs to train (default: 14)') 92 | parser.add_argument('--lr', type=float, default=1.0, metavar='LR', 93 | help='learning rate (default: 1.0)') 94 | parser.add_argument('--gamma', type=float, default=0.7, metavar='M', 95 | help='Learning rate step gamma (default: 0.7)') 96 | parser.add_argument('--dry-run', action='store_true', default=False, 97 | help='quickly check a single pass') 98 | parser.add_argument('--seed', type=int, default=1, metavar='S', 99 | help='random seed (default: 1)') 100 | parser.add_argument('--log-interval', type=int, default=10, metavar='N', 101 | help='how many batches to wait before logging training status') 102 | parser.add_argument('--save-model', action='store_true', default=False, 103 | help='For Saving the current Model') 104 | parser.add_argument("--id", type=str, default="id", 105 | help="identifier") 106 | parser.add_argument("--cuda", type=str, default="0", 107 | help="cuda devices, seperated by `,` with no spaces") 108 | parser.add_argument("--wait", action="store_true", 109 | help="wait for watchmen signal") 110 | parser.add_argument("--wait_mode", type=str, 111 | choices=["queue", "schedule"], default="queue", 112 | help="gpu waiting mode") 113 | args = parser.parse_args() 114 | torch.manual_seed(args.seed) 115 | 116 | """WATCHMEN""" 117 | if args.wait: 118 | if args.wait_mode == 'queue': 119 | waiting_mode = ClientMode.QUEUE 120 | else: 121 | waiting_mode = ClientMode.SCHEDULE 122 | client = WatchClient(id=f"mnist single card {args.id} cuda={args.cuda}", 123 | gpus=eval(f"[{args.cuda}]"), 124 | req_gpu_num=1, mode=waiting_mode, 125 | server_host="127.0.0.1", server_port=62333) 126 | # client.register() 127 | available_gpus = [] 128 | available_gpus = client.wait() 129 | if len(available_gpus) <= 0: 130 | sys.exit(1) 131 | else: 132 | device = torch.device(f"cuda:{available_gpus[0]}") 133 | """END OF WATCHMEN""" 134 | 135 | train_kwargs = {'batch_size': args.batch_size} 136 | test_kwargs = {'batch_size': args.test_batch_size} 137 | 138 | cuda_kwargs = {'num_workers': 1, 139 | 'pin_memory': True, 140 | 'shuffle': True} 141 | train_kwargs.update(cuda_kwargs) 142 | test_kwargs.update(cuda_kwargs) 143 | 144 | transform=transforms.Compose([ 145 | transforms.ToTensor(), 146 | 
transforms.Normalize((0.1307,), (0.3081,)) 147 | ]) 148 | 149 | dataset1 = datasets.MNIST('../data', train=True, download=True, 150 | transform=transform) 151 | dataset2 = datasets.MNIST('../data', train=False, 152 | transform=transform) 153 | train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) 154 | test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) 155 | 156 | model = Net().to(device) 157 | optimizer = optim.Adadelta(model.parameters(), lr=args.lr) 158 | 159 | scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) 160 | for epoch in range(1, args.epochs + 1): 161 | train(args, model, device, train_loader, optimizer, epoch) 162 | test(model, device, test_loader) 163 | scheduler.step() 164 | 165 | if args.save_model: 166 | torch.save(model.state_dict(), "mnist_cnn.pt") 167 | 168 | 169 | if __name__ == '__main__': 170 | main() 171 | -------------------------------------------------------------------------------- /example/wait.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | import torch 5 | 6 | from watchmen import WatchClient 7 | from watchmen.client import ClientMode 8 | 9 | 10 | def main(): 11 | # Training settings 12 | parser = argparse.ArgumentParser(description='Minimal GPU Scheduling Example') 13 | parser.add_argument("--id", type=str, default="id", 14 | help="identifier") 15 | parser.add_argument("--cuda", type=str, default="0", 16 | help="cuda devices, seperated by `,` with no spaces") 17 | parser.add_argument("--wait", action="store_true", 18 | help="wait for watchmen signal") 19 | parser.add_argument("--wait_mode", type=str, 20 | choices=["queue", "schedule"], default="queue", 21 | help="gpu waiting mode") 22 | parser.add_argument("--req_gpu_num", type=int, default=1, 23 | help="number of GPUs to request") 24 | parser.add_argument("--token", type=str, default="", 25 | help="authentication token") 26 | args = parser.parse_args() 27 | 28 | """WATCHMEN""" 29 | if args.wait: 30 | if args.wait_mode == 'queue': 31 | waiting_mode = ClientMode.QUEUE 32 | else: 33 | waiting_mode = ClientMode.SCHEDULE 34 | client = WatchClient(id=f"mnist single card {args.id} cuda={args.cuda}", 35 | gpus=eval(f"[{args.cuda}]"), 36 | req_gpu_num=args.req_gpu_num, mode=waiting_mode, 37 | server_host="127.0.0.1", server_port=62333, 38 | token=args.token) 39 | # client.register() 40 | available_gpus = [] 41 | available_gpus = client.wait() 42 | if len(available_gpus) <= 0: 43 | sys.exit(1) 44 | else: 45 | device = torch.device(f"cuda:{available_gpus[0]}") 46 | """END OF WATCHMEN""" 47 | print(f"Using GPU: {device}") 48 | input("Press Enter to continue...") 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /finished_queue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Spico197/watchmen/1075c5b3b32e65fb4692b4dfea3c768223a345cd/finished_queue.png -------------------------------------------------------------------------------- /homepage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Spico197/watchmen/1075c5b3b32e65fb4692b4dfea3c768223a345cd/homepage.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | APScheduler>=3.6.3 2 | 
Flask>=1.1.2 3 | gpustat>=0.6.0 4 | pydantic>=1.7.1 5 | requests>=2.24.0 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | from watchmen import __version__ 4 | 5 | 6 | with open("README.md", "r") as fh: 7 | long_description = fh.read() 8 | 9 | setuptools.setup( 10 | name='gpu-watchmen', 11 | version=__version__, 12 | author="Tong Zhu", 13 | author_email="tzhu1997@outlook.com", 14 | description="watchmen for GPU scheduling", 15 | long_description_content_type="text/markdown", 16 | long_description=long_description, 17 | url="https://github.com/Spico197/watchmen", 18 | packages=[ 19 | "watchmen" 20 | ], 21 | classifiers=[ 22 | "Programming Language :: Python :: 3", 23 | "License :: OSI Approved :: MIT License", 24 | "Operating System :: POSIX :: Linux" 25 | ], 26 | python_requires='>=3.6', 27 | install_requires=[ 28 | "apscheduler>=3.6.3", 29 | "flask>=1.1.2", 30 | "gpustat>=0.6.0", 31 | "pydantic>=1.7.1", 32 | "requests>=2.24.0", 33 | ], 34 | package_data={ 35 | 'watchmen' : [ 36 | 'templates/*.html' 37 | ], 38 | }, 39 | include_package_data=True, 40 | ) 41 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Spico197/watchmen/1075c5b3b32e65fb4692b4dfea3c768223a345cd/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_client.py: -------------------------------------------------------------------------------- 1 | from watchmen.client import ClientMode 2 | 3 | 4 | def test_in_mode_method(): 5 | assert ClientMode.has_value("queue") is True 6 | -------------------------------------------------------------------------------- /watchmen/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import WatchClient 2 | from .client import ClientMode 3 | 4 | __version__ = "0.4.0" 5 | -------------------------------------------------------------------------------- /watchmen/client.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | import datetime 4 | import getpass 5 | import os 6 | from enum import Enum 7 | from typing import List, Optional 8 | from collections import OrderedDict 9 | 10 | import requests 11 | from pydantic import BaseModel 12 | 13 | from watchmen.listener import check_gpus_existence, check_req_gpu_num 14 | 15 | 16 | logger = logging.getLogger("common") 17 | logger.setLevel(logging.INFO) # Change to INFO to see info messages 18 | 19 | # Add a handler if none exists 20 | if not logger.handlers: 21 | console_handler = logging.StreamHandler() 22 | console_handler.setLevel(logging.INFO) 23 | formatter = logging.Formatter( 24 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 25 | ) 26 | console_handler.setFormatter(formatter) 27 | logger.addHandler(console_handler) 28 | 29 | TOKEN_FILE = ".watchmen_client.token" 30 | 31 | 32 | class ClientStatus(str, Enum): 33 | WAITING = "waiting" 34 | TIMEOUT = "timeout" 35 | READY = "ready" 36 | OK = "ok" 37 | CANCELLED = "cancelled" 38 | 39 | 40 | class ClientMode(str, Enum): 41 | QUEUE = "queue" 42 | SCHEDULE = "schedule" 43 | 44 | @classmethod 45 | def has_value(cls, value): 46 | return value in set(cls._member_map_.values()) 47 | 48 | 49 | class 
ClientModel(BaseModel): 50 | id: str # identifier in string format 51 | # `queue` (wait for specific gpus) or `schedule` (schedule by the server automatically) 52 | mode: Optional[ClientMode] = ClientMode.QUEUE 53 | register_time: Optional[datetime.datetime] = None # datetime.datetime 54 | last_request_time: Optional[datetime.datetime] = None # datetime.datetime 55 | status: Optional[ClientStatus] = ClientStatus.WAITING # `waiting`, `timeout`, `ok` 56 | queue_num: Optional[int] = 0 # queue number 57 | # `queue` mode: gpus for requesting to run on; `schedule` mode: available gpu scope. 58 | gpus: Optional[List[int]] = [] 59 | msg: Optional[str] = "" # error or status message 60 | req_gpu_num: Optional[int] = 0 # `schedule` mode: how many gpus are requested 61 | available_gpus: Optional[List[int]] = [] 62 | 63 | 64 | class ClientCollection(object): 65 | def __init__(self): 66 | self.work_queue = OrderedDict() # only `ok` and `waiting` 67 | self.finished_queue = OrderedDict() 68 | 69 | def mark_finished(self, client_id: str): 70 | self.finished_queue[client_id] = self.work_queue[client_id] 71 | self.work_queue.pop(client_id) 72 | 73 | def get_all_clients(self): 74 | all_clients = [] 75 | all_clients.extend(list(self.finished_queue.values())) 76 | all_clients.sort(key=lambda x: x.last_request_time) 77 | all_clients.extend(list(self.work_queue.values())) 78 | return all_clients 79 | 80 | def __getitem__(self, index: str): 81 | if index in self.work_queue: 82 | return self.work_queue[index] 83 | else: 84 | raise IndexError(f"index: {index} does not exist or has finished") 85 | 86 | def __contains__(self, index: str): 87 | return index in self.work_queue 88 | 89 | 90 | def load_token_from_file(): 91 | """Load authentication token from file if it exists.""" 92 | if os.path.exists(TOKEN_FILE): 93 | with open(TOKEN_FILE, "r") as f: 94 | return f.read().strip() 95 | return None 96 | 97 | 98 | def save_token_to_file(token): 99 | """Save authentication token to file.""" 100 | with open(TOKEN_FILE, "w") as f: 101 | f.write(token) 102 | 103 | 104 | class WatchClient(object): 105 | def __init__( 106 | self, 107 | id: str, 108 | gpus: List[int], 109 | server_host: str, 110 | server_port: int, 111 | mode: Optional[ClientMode] = ClientMode.QUEUE, 112 | req_gpu_num: Optional[int] = 0, 113 | timeout: Optional[int] = 60, 114 | token: Optional[str] = None, 115 | ): 116 | self.base_url = f"http://{server_host}:{server_port}" 117 | self.id = f"{getpass.getuser()}@{id}" 118 | if self._validate_gpus(gpus): 119 | self.gpus = gpus 120 | else: 121 | raise ValueError("Check the GPU existence") 122 | if not self._validate_mode(mode): 123 | raise ValueError(f"Check the mode: {mode}") 124 | self.mode = mode 125 | if self.mode == ClientMode.SCHEDULE: 126 | if not self._validate_req_gpu_num(req_gpu_num): 127 | raise ValueError(f"Check the `req_gpu_num`: {req_gpu_num}") 128 | self.req_gpu_num = req_gpu_num 129 | self.timeout = timeout 130 | 131 | # Handle token authentication 132 | self.token = token 133 | if not self.token: 134 | logger.info(f"No token provided, trying to load from file {TOKEN_FILE}") 135 | self.token = load_token_from_file() 136 | if self.token: 137 | logger.info(f"Dump token to file {TOKEN_FILE}") 138 | save_token_to_file(self.token) 139 | else: 140 | logger.info("No token provided, and no token file found") 141 | 142 | def _validate_gpus(self, gpus: List[int]): 143 | return check_gpus_existence(gpus) 144 | 145 | def _validate_mode(self, mode: ClientMode): 146 | return ClientMode.has_value(mode) 147 | 148 
| def _validate_req_gpu_num(self, req_gpu_num: int): 149 | return check_req_gpu_num(req_gpu_num) 150 | 151 | def _get_headers(self): 152 | """Get request headers with authentication token if available.""" 153 | headers = {"Content-Type": "application/json"} 154 | if self.token: 155 | headers["X-Auth-Token"] = self.token 156 | return headers 157 | 158 | def register(self): 159 | data = { 160 | "id": self.id, 161 | "gpus": self.gpus, 162 | "mode": self.mode, 163 | "req_gpu_num": self.req_gpu_num, 164 | } 165 | result = requests.post( 166 | self.base_url + "/client/register", 167 | json=data, 168 | headers=self._get_headers(), 169 | timeout=self.timeout, 170 | ) 171 | result = result.json() 172 | if result["status"] != "ok": 173 | raise RuntimeError(f"err registering: {result['msg']}") 174 | 175 | def ping(self): 176 | data = {"id": self.id} 177 | result = requests.post( 178 | self.base_url + "/client/ping", 179 | json=data, 180 | headers=self._get_headers(), 181 | timeout=self.timeout, 182 | ).json() 183 | if result["status"] != "ok": 184 | raise RuntimeError(f"err registering: {result['msg']}") 185 | else: 186 | if result["msg"] == ClientStatus.WAITING: 187 | return False, result["available_gpus"] 188 | elif result["msg"] == ClientStatus.READY: 189 | return True, result["available_gpus"] 190 | elif result["msg"] == ClientStatus.OK: 191 | logger.warning("Status is OK, which has finished requesting GPUs.") 192 | return False, result["available_gpus"] 193 | elif result["msg"] == ClientStatus.TIMEOUT: 194 | raise RuntimeError("status changed to TIMEOUT") 195 | elif result["msg"] == ClientStatus.CANCELLED: 196 | raise RuntimeError("client has been cancelled") 197 | 198 | def wait(self): 199 | self.register() 200 | flag = False 201 | available_gpus = [] 202 | while not flag: 203 | flag, available_gpus = self.ping() 204 | time.sleep(10) 205 | return available_gpus 206 | -------------------------------------------------------------------------------- /watchmen/listener.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List 3 | 4 | from gpustat.core import GPUStatCollection 5 | 6 | 7 | def is_single_gpu_totally_free(gpu_index: int): 8 | gs = GPUStatCollection.new_query() 9 | 10 | if not isinstance(gpu_index, int): 11 | raise ValueError(f"gpu_index: {gpu_index} is not int") 12 | if gpu_index >= len(gs.gpus) or gpu_index < 0: 13 | raise ValueError(f"gpu_index: {gpu_index} does not exist") 14 | 15 | gpu = gs.gpus[gpu_index] 16 | if len(gpu.processes) <= 0 \ 17 | and gpu.utilization <= 10 \ 18 | and (float(gpu.memory_used) / float(gpu.memory_total) <= 1e-3 or gpu.memory_used < 50): 19 | return True 20 | else: 21 | return False 22 | 23 | 24 | def check_gpus_existence(gpus: List[int]): 25 | gs = GPUStatCollection.new_query() 26 | for gpu in gpus: 27 | try: 28 | gs.gpus[gpu] 29 | except KeyError: 30 | return False 31 | return True 32 | 33 | 34 | def check_req_gpu_num(req_gpu_num: int): 35 | gs = GPUStatCollection.new_query() 36 | return req_gpu_num <= len(gs.gpus) 37 | 38 | 39 | class GPUInfo(object): 40 | def __init__(self): 41 | self.gpus = [] 42 | self.new_query() 43 | 44 | def new_query(self): 45 | gs = GPUStatCollection.new_query() 46 | self.gpus = gs.gpus 47 | self.gs = gs 48 | 49 | def _is_totally_free(self, gpu_index: int): 50 | self.new_query() 51 | gpu = self.gpus[gpu_index] 52 | if len(gpu.processes) <= 0 \ 53 | and gpu.utilization <= 10 \ 54 | and (float(gpu.memory_used) / float(gpu.memory_total) <= 1e-3 or 
gpu.memory_used < 50): 55 | return True 56 | else: 57 | return False 58 | 59 | def is_gpus_available(self, gpus: List[int]): 60 | stts = [] 61 | for gpu in gpus: 62 | stts.append(self._is_totally_free(gpu)) 63 | return all(stts) 64 | 65 | def get_available_gpus_in_scope(self, gpu_scope: List[int]): 66 | available_gpus = [] 67 | for gpu in gpu_scope: 68 | if self._is_totally_free(gpu): 69 | available_gpus.append(gpu) 70 | return available_gpus 71 | 72 | def is_req_gpu_num_satisfied(self, gpu_scope: List[int], req_gpu_num: int): 73 | ok = False 74 | available_gpus = self.get_available_gpus_in_scope(gpu_scope) 75 | if req_gpu_num <= len(available_gpus): 76 | available_gpus = available_gpus[:req_gpu_num] 77 | ok = True 78 | return ok, available_gpus 79 | 80 | def __getitem__(self, index: int): 81 | return self._is_totally_free(index) 82 | 83 | def __str__(self): 84 | tmp = self.gs.jsonify() 85 | tmp["query_time"] = str(tmp["query_time"]) 86 | return json.dumps(tmp, indent=2, ensure_ascii=False) 87 | -------------------------------------------------------------------------------- /watchmen/reminder.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import hashlib 3 | import hmac 4 | import base64 5 | import urllib 6 | import smtplib 7 | from email.mime.multipart import MIMEMultipart 8 | from email.mime.text import MIMEText 9 | from typing import Optional, List 10 | 11 | import requests 12 | 13 | 14 | def send_email( 15 | host: str, # email host to login, like `smtp.163.com` 16 | port: int, # email port to login, like `25` 17 | user: str, # user email address for login, like `***@163.com` 18 | password: str, # password or auth code for login 19 | receiver: str, # receiver email address 20 | html_message: str, # content, html format supported 21 | subject: Optional[str] = "Notice" # email subject 22 | ): 23 | # set up the SMTP server 24 | s = smtplib.SMTP(host=host, port=port) 25 | s.starttls() 26 | s.login(user, password) 27 | 28 | msg = MIMEMultipart() # create a message 29 | msg['From'] = user 30 | msg['To'] = receiver 31 | msg['Subject'] = subject 32 | msg.attach(MIMEText(html_message, 'html')) 33 | s.send_message(msg) 34 | del msg 35 | # Terminate the SMTP session and close the connection 36 | s.quit() 37 | 38 | 39 | def send_dingtalk_msg( 40 | dingtalk_user_mentions: List[str], # which user to mention, like `[183********]` 41 | dingtalk_secret: str, # like SEc1f**** 42 | dingtalk_webhook_url: str, # like `https://oapi.dingtalk.com/robot/send?access_token=***` 43 | message: str # message content 44 | ): 45 | r""" 46 | Reference: 47 | - https://github.com/huggingface/knockknock 48 | """ 49 | msg_template = { 50 | "msgtype": "text", 51 | "text": { 52 | "content": message 53 | }, 54 | "at": { 55 | "atMobiles": dingtalk_user_mentions, 56 | "isAtAll": False 57 | } 58 | } 59 | ''' 60 | construct_encrypted_url 61 | Visit https://ding-doc.dingtalk.com/doc#/serverapi2/qf2nxq for details 62 | ''' 63 | timestamp = round(datetime.datetime.now().timestamp() * 1000) 64 | secret_enc = dingtalk_secret.encode('utf-8') 65 | string_to_sign = '{}\n{}'.format(timestamp, dingtalk_secret) 66 | string_to_sign_enc = string_to_sign.encode('utf-8') 67 | hmac_code = hmac.new(secret_enc, string_to_sign_enc, digestmod=hashlib.sha256).digest() 68 | sign = urllib.parse.quote_plus(base64.b64encode(hmac_code)) 69 | encrypted_url = dingtalk_webhook_url + '×tamp={}'\ 70 | .format(timestamp) + '&sign={}'.format(sign) 71 | postto = encrypted_url 72 | requests.post(postto, 
json=msg_template) 73 | -------------------------------------------------------------------------------- /watchmen/server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import queue 4 | import logging 5 | import readline # noqa: F401 6 | import datetime 7 | import argparse 8 | import threading 9 | import secrets 10 | from functools import wraps 11 | 12 | from flask import Flask, jsonify, request, render_template, make_response, session 13 | from flask.json.provider import DefaultJSONProvider 14 | from apscheduler.schedulers.blocking import BlockingScheduler 15 | 16 | from watchmen.listener import ( 17 | is_single_gpu_totally_free, 18 | check_gpus_existence, 19 | check_req_gpu_num, 20 | GPUInfo, 21 | ) 22 | from watchmen.client import ClientStatus, ClientMode, ClientModel, ClientCollection 23 | 24 | 25 | apscheduler_logger = logging.getLogger("apscheduler") 26 | apscheduler_logger.setLevel(logging.ERROR) 27 | logger = logging.getLogger("common") 28 | logger.setLevel(logging.INFO) 29 | fmt = "[%(asctime)-15s]-%(levelname)s-%(filename)s-%(lineno)d-%(process)d: %(message)s" 30 | datefmt = "%a %d %b %Y %H:%M:%S" 31 | formatter = logging.Formatter(fmt, datefmt) 32 | stream_handler = logging.StreamHandler(sys.stdout) 33 | stream_handler.setFormatter(formatter) 34 | logger.addHandler(stream_handler) 35 | 36 | app = Flask("watchmen.server") 37 | app.secret_key = os.environ.get("WATCHMEN_SECRET_KEY", secrets.token_hex(32)) 38 | gpu_queue = queue.Queue() 39 | gpu_info = GPUInfo() 40 | gpu_queue.put(1) 41 | client_queue = queue.Queue() 42 | cc = ClientCollection() 43 | client_queue.put(1) 44 | 45 | APP_PORT = None 46 | AUTH_TOKEN = None 47 | PID_FILE = ".watchmen_server.pid" 48 | TOKEN_FILE = ".watchmen_server.token" 49 | 50 | 51 | class CustomJSONProvider(DefaultJSONProvider): 52 | def default(self, obj): 53 | try: 54 | if isinstance(obj, datetime.datetime): 55 | return obj.strftime("%Y-%m-%d %H:%M:%S") 56 | iterable = iter(obj) 57 | except TypeError: 58 | pass 59 | else: 60 | return list(iterable) 61 | return DefaultJSONProvider.default(self, obj) 62 | 63 | 64 | app.json_provider_class = CustomJSONProvider 65 | 66 | 67 | def load_token_from_file(): 68 | """Load authentication token from file if it exists.""" 69 | if os.path.exists(TOKEN_FILE): 70 | with open(TOKEN_FILE, "r") as f: 71 | return f.read().strip() 72 | return None 73 | 74 | 75 | def save_token_to_file(token): 76 | """Save authentication token to file.""" 77 | with open(TOKEN_FILE, "w") as f: 78 | f.write(token) 79 | 80 | 81 | def generate_token(): 82 | """Generate a random token.""" 83 | return secrets.token_hex(16) # 32 character hex string 84 | 85 | 86 | def login_required(f): 87 | """Decorator to require authentication for a route.""" 88 | 89 | @wraps(f) 90 | def decorated_function(*args, **kwargs): 91 | if not AUTH_TOKEN: 92 | return f(*args, **kwargs) # No authentication required 93 | 94 | # Check session 95 | if session.get("authenticated"): 96 | return f(*args, **kwargs) 97 | 98 | # Check token in headers or query parameters 99 | token = request.headers.get("X-Auth-Token") or request.args.get("token") 100 | if token and token == AUTH_TOKEN: 101 | session["authenticated"] = True 102 | return f(*args, **kwargs) 103 | 104 | return jsonify({"status": "err", "msg": "Authentication required"}), 401 105 | 106 | return decorated_function 107 | 108 | 109 | @app.route("/auth", methods=["POST"]) 110 | def authenticate(): 111 | """Authenticate with token.""" 112 | if not 
AUTH_TOKEN: 113 | return jsonify({"status": "ok", "msg": "No authentication required"}) 114 | 115 | data = request.get_json() 116 | if not data or "token" not in data: 117 | return jsonify({"status": "err", "msg": "Token required"}), 400 118 | 119 | if data["token"] == AUTH_TOKEN: 120 | session["authenticated"] = True 121 | return jsonify({"status": "ok", "msg": "Authentication successful"}) 122 | else: 123 | return jsonify({"status": "err", "msg": "Invalid token"}), 401 124 | 125 | 126 | @app.route("/gpu/") 127 | @login_required 128 | def get_single_gpu_status(gpu_id: int): 129 | status = "" 130 | msg = "" 131 | try: 132 | msg = is_single_gpu_totally_free(gpu_id) 133 | status = "ok" 134 | except ValueError as err: 135 | msg = str(err) 136 | status = "err" 137 | return jsonify({"status": status, "msg": msg}) 138 | 139 | 140 | @app.route("/gpus/") 141 | @login_required 142 | def get_gpus_status(gpu_ids: str): 143 | status = "err" 144 | msg = "" 145 | detail = [] 146 | try: 147 | gpu_ids = sorted(map(int, gpu_ids.split(","))) 148 | for gpu_id in gpu_ids: 149 | detail.append({"gpu": gpu_id, "status": is_single_gpu_totally_free(gpu_id)}) 150 | if all(detail): 151 | msg = True 152 | else: 153 | msg = False 154 | status = "ok" 155 | except ValueError as err: 156 | msg = str(err) 157 | status = "err" 158 | return jsonify({"status": status, "msg": msg, "detail": detail}) 159 | 160 | 161 | @app.route("/client/ping", methods=["POST"]) 162 | @login_required 163 | def client_ping(): 164 | client_info = ClientModel(**request.json) 165 | status = "" 166 | available_gpus = [] 167 | msg = "" 168 | client_id = client_info.id 169 | if client_id in cc: 170 | cc[client_id].last_request_time = datetime.datetime.now() 171 | status = "ok" 172 | available_gpus = cc[client_id].available_gpus 173 | msg = cc[client_id].status 174 | elif client_id in cc.finished_queue: 175 | status = "ok" 176 | available_gpus = cc.finished_queue[client_id].available_gpus 177 | msg = cc.finished_queue[client_id].status 178 | else: 179 | status = "err" 180 | msg = "client not registered or has been cancelled" 181 | info = {"status": status, "available_gpus": available_gpus, "msg": msg} 182 | logger.info(f"client {client_id} ping: {info}") 183 | return jsonify(info) 184 | 185 | 186 | @app.route("/client/register", methods=["POST"]) 187 | @login_required 188 | def client_register(): 189 | client_info = ClientModel(**request.json) 190 | status = "" 191 | msg = "" 192 | if len(client_info.gpus) <= 0: 193 | status = "err" 194 | msg = "gpus must not be empty!" 195 | elif not ClientMode.has_value(client_info.mode): 196 | status = "err" 197 | msg = f"mode {client_info.mode} is not supported" 198 | elif not check_gpus_existence(client_info.gpus): 199 | status = "err" 200 | msg = "check the gpus existence" 201 | elif client_info.mode == ClientMode.SCHEDULE and not check_req_gpu_num( 202 | client_info.req_gpu_num 203 | ): 204 | status = "err" 205 | msg = "`req_gpu_num` is not valid" 206 | else: 207 | if client_info.id not in cc: 208 | client = ClientModel( 209 | id=client_info.id, 210 | mode=client_info.mode, 211 | status=ClientStatus.WAITING, 212 | register_time=datetime.datetime.now(), 213 | last_request_time=datetime.datetime.now(), 214 | queue_num=len(cc.work_queue), 215 | gpus=client_info.gpus, 216 | req_gpu_num=client_info.req_gpu_num, 217 | ) 218 | cc.work_queue[client.id] = client 219 | status = "ok" 220 | else: 221 | status = "err" 222 | msg = f"client_id: {client_info.id} has been registered!" 
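        # Response contract: {"status": "ok" | "err", "msg": ...}; WatchClient.register()
        # raises a RuntimeError whenever status != "ok", so a duplicate id fails fast on the client side.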
223 | return jsonify({"status": status, "msg": msg}) 224 | 225 | 226 | @app.route("/client/cancel", methods=["POST"]) 227 | @login_required 228 | def client_cancel(): 229 | status = "" 230 | msg = "" 231 | try: 232 | client_id = request.json.get("id") 233 | if client_id in cc: 234 | client = cc[client_id] 235 | if client.status in [ClientStatus.WAITING, ClientStatus.READY]: 236 | client.status = ClientStatus.CANCELLED 237 | cc.mark_finished(client_id) 238 | status = "ok" 239 | msg = f"Client {client_id} cancelled successfully" 240 | else: 241 | status = "err" 242 | msg = f"Client {client_id} is not waiting" 243 | else: 244 | status = "err" 245 | msg = f"Client {client_id} not found" 246 | except Exception as err: 247 | status = "err" 248 | msg = str(err) 249 | return jsonify({"status": status, "msg": msg}) 250 | 251 | 252 | @app.route("/show/work", methods=["GET"]) 253 | @login_required 254 | def show_work(): 255 | status = "" 256 | msg = "" 257 | try: 258 | status = "ok" 259 | msg = [x.dict() for x in cc.work_queue.values()] 260 | except Exception as err: 261 | status = "err" 262 | msg = str(err) 263 | return jsonify({"status": status, "msg": msg}) 264 | 265 | 266 | @app.route("/show/finished", methods=["GET"]) 267 | @login_required 268 | def show_finished(): 269 | status = "" 270 | msg = "" 271 | try: 272 | status = "ok" 273 | msg = [x.dict() for x in cc.finished_queue.values()] 274 | except Exception as err: 275 | status = "err" 276 | msg = str(err) 277 | return jsonify({"status": status, "msg": msg}) 278 | 279 | 280 | @app.route("/show/gpus", methods=["GET"]) 281 | @login_required 282 | def show_gpus(): 283 | status = "" 284 | msg = "" 285 | try: 286 | status = "ok" 287 | msg = gpu_info.gs.jsonify() 288 | msg["query_time"] = str(msg["query_time"]) 289 | except Exception as err: 290 | status = "err" 291 | msg = str(err) 292 | return jsonify({"status": status, "msg": msg}) 293 | 294 | 295 | @app.route("/api", methods=["GET", "OPTIONS"]) 296 | @login_required 297 | def api(): 298 | if request.method == "OPTIONS": 299 | response = make_response() 300 | else: 301 | gpu_info = show_gpus() 302 | gpu_msg = gpu_info.json["msg"] 303 | work_info = show_work() 304 | work_msg = work_info.json["msg"] 305 | finished_info = show_finished() 306 | finished_msg = finished_info.json["msg"] 307 | response = jsonify( 308 | {"gpu": gpu_msg, "work_queue": work_msg, "finished_queue": finished_msg} 309 | ) 310 | response.headers["Access-Control-Allow-Origin"] = "*" 311 | return response 312 | 313 | 314 | @app.route("/", methods=["GET"]) 315 | def index(): 316 | global APP_PORT 317 | is_authenticated = session.get("authenticated", False) or not AUTH_TOKEN 318 | auth_required = AUTH_TOKEN is not None 319 | return render_template( 320 | "index.html", 321 | port=APP_PORT, 322 | is_authenticated=is_authenticated, 323 | auth_required=auth_required, 324 | ) 325 | 326 | 327 | @app.route("/old", methods=["GET"]) 328 | @login_required 329 | def old_index(): 330 | gpu_info = show_gpus() 331 | gpu_msg = gpu_info.json["msg"] 332 | work_info = show_work() 333 | work_msg = work_info.json 334 | finished_info = show_finished() 335 | finished_msg = finished_info.json 336 | return render_template( 337 | "old_index.html", gpu_msg=gpu_msg, work_msg=work_msg, finished_msg=finished_msg 338 | ) 339 | 340 | 341 | def check_gpu_info(): 342 | gpu_info.new_query() 343 | logger.info("check gpu info") 344 | 345 | 346 | def check_work(queue_timeout): 347 | logger.info("regular check") 348 | marked_finished = [] 349 | reserved_gpus = set() 
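    # Two-pass assignment: the first pass drops timed-out clients and records, for every
    # waiting client, whether its requested GPUs (queue mode) or enough GPUs from its scope
    # (schedule mode) are currently free; GPUs already promised to `ready` clients are kept
    # in `reserved_gpus`. The second pass below marks a client `ready` only when its candidate
    # GPUs do not overlap `reserved_gpus`, so a free GPU is never granted to two clients at once.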
350 | client_list = [] 351 | queue_num = 0 352 | for client_id, client in cc.work_queue.items(): 353 | time_delta = datetime.datetime.now() - client.last_request_time 354 | logger.info( 355 | f"client: {client.id}, time_delta.seconds: {time_delta.seconds}, time_delta: {time_delta}" 356 | ) 357 | if time_delta.seconds > queue_timeout: 358 | if client.status != ClientStatus.READY: 359 | client.status = ClientStatus.TIMEOUT 360 | else: 361 | client.status = ClientStatus.OK 362 | # invalid client 363 | client.queue_num = -1 364 | marked_finished.append(client_id) 365 | continue 366 | client.queue_num = queue_num 367 | ok = False 368 | available_gpus = [] 369 | if client.status == ClientStatus.READY: 370 | reserved_gpus |= set(client.available_gpus) 371 | else: 372 | try: 373 | if client.mode == "queue": 374 | ok = gpu_info.is_gpus_available(client.gpus) 375 | available_gpus = client.gpus 376 | elif client.mode == "schedule": 377 | ok, available_gpus = gpu_info.is_req_gpu_num_satisfied( 378 | client.gpus, client.req_gpu_num 379 | ) 380 | else: 381 | raise RuntimeError(f"Not supported mode: {client.mode}") 382 | except IndexError as err: 383 | client.msg = str(err) 384 | except ValueError as err: 385 | client.msg = str(err) 386 | except RuntimeError as err: 387 | client.msg = str(err) 388 | 389 | client_list.append([client_id, client, ok, set(available_gpus)]) 390 | queue_num += 1 391 | 392 | # post check and assignment, and make sure gpus of `ready` clients will not be assigned to the others 393 | for client_id, client, ok, available_gpu_set in client_list: 394 | if ( 395 | ok 396 | and len(available_gpu_set) > 0 397 | and len(available_gpu_set & reserved_gpus) < 1 398 | ): 399 | client.status = ClientStatus.READY 400 | client.available_gpus = available_gpus 401 | reserved_gpus |= set(client.available_gpus) 402 | logger.info( 403 | f"client: {client.id} is ready, available gpus: {client.available_gpus}" 404 | ) 405 | 406 | for client_id in marked_finished: 407 | logger.info(f"client {client.id} marked as finished, status: {client.status}") 408 | cc.mark_finished(client_id) 409 | 410 | 411 | def check_finished(status_queue_keep_time): 412 | logger.info("check out-dated finished clients") 413 | marked_delete_ids = [] 414 | for client_id, client in cc.finished_queue.items(): 415 | delta = datetime.datetime.now() - client.last_request_time 416 | if (delta.days * 24 + delta.seconds / 3600) >= status_queue_keep_time: 417 | marked_delete_ids.append(client_id) 418 | for client_id in marked_delete_ids: 419 | cc.finished_queue.pop(client_id) 420 | logger.info(f"remove {client.id} from finished queue") 421 | 422 | 423 | def regular_check(request_interval, queue_timeout, status_queue_keep_time): 424 | scheduler = BlockingScheduler(logger=apscheduler_logger) 425 | scheduler.add_job( 426 | check_gpu_info, 427 | trigger="interval", 428 | seconds=request_interval, 429 | next_run_time=datetime.datetime.now(), 430 | ) 431 | scheduler.add_job( 432 | check_work, 433 | trigger="interval", 434 | seconds=request_interval * 5, 435 | args=(queue_timeout,), 436 | next_run_time=datetime.datetime.now(), 437 | ) 438 | if status_queue_keep_time != -1: 439 | scheduler.add_job( 440 | check_finished, 441 | trigger="interval", 442 | hours=status_queue_keep_time, 443 | args=(status_queue_keep_time,), 444 | next_run_time=datetime.datetime.now(), 445 | ) 446 | scheduler.start() 447 | 448 | 449 | def api_server(host, port): 450 | global APP_PORT 451 | APP_PORT = port 452 | app.run(host=host, port=port) 453 | 454 | 455 | if __name__ 
== "__main__": 456 | parser = argparse.ArgumentParser() 457 | parser.add_argument( 458 | "--host", type=str, default="0.0.0.0", help="host address for api server" 459 | ) 460 | parser.add_argument("--port", type=str, default=62333, help="port for api server") 461 | parser.add_argument( 462 | "--queue_timeout", 463 | type=int, 464 | default=300, 465 | help="timeout for queue waiting (seconds)", 466 | ) 467 | parser.add_argument( 468 | "--request_interval", 469 | type=int, 470 | default=1, 471 | help="interval for gpu status requesting (seconds)", 472 | ) 473 | parser.add_argument( 474 | "--status_queue_keep_time", 475 | type=int, 476 | default=48, 477 | help=( 478 | "hours for keeping the client status. " 479 | "set `-1` to keep all clients' status" 480 | ), 481 | ) 482 | parser.add_argument( 483 | "--token", 484 | type=str, 485 | default="", 486 | help="Authentication token for accessing the web interface. If empty, a token will be generated. Set to 'none' to disable authentication.", 487 | ) 488 | args = parser.parse_args() 489 | 490 | # Handle token authentication 491 | if args.token.lower() == "none": 492 | AUTH_TOKEN = None 493 | if os.path.exists(TOKEN_FILE): 494 | os.remove(TOKEN_FILE) # Remove token file if auth is disabled 495 | logger.info("Authentication disabled") 496 | else: 497 | # Check for token from command line, file, or generate a new one 498 | if args.token: 499 | AUTH_TOKEN = args.token 500 | save_token_to_file(AUTH_TOKEN) 501 | else: 502 | AUTH_TOKEN = load_token_from_file() 503 | if not AUTH_TOKEN: 504 | AUTH_TOKEN = generate_token() 505 | save_token_to_file(AUTH_TOKEN) 506 | 507 | logger.info(f"Authentication enabled with token: {AUTH_TOKEN}") 508 | logger.info(f"Token saved to {os.path.abspath(TOKEN_FILE)}") 509 | 510 | logger.info(f"Running at: {args.host}:{args.port}") 511 | logger.info(f"Current pid: {os.getpid()} > {PID_FILE}") 512 | with open(PID_FILE, "wt", encoding="utf-8") as fout: 513 | fout.write(f"{os.getpid()}") 514 | 515 | # daemon threads will end automaticly if the main thread ends 516 | # thread 1: check gpu and client info regularly 517 | check_worker = threading.Thread( 518 | name="check", 519 | target=regular_check, 520 | args=(args.request_interval, args.queue_timeout, args.status_queue_keep_time), 521 | daemon=True, 522 | ) 523 | 524 | # thread 2: main server api backend 525 | api_server_worker = threading.Thread( 526 | name="api", target=api_server, args=(args.host, args.port), daemon=True 527 | ) 528 | 529 | check_worker.start() 530 | logger.info("check worker started") 531 | api_server_worker.start() 532 | logger.info("api server started") 533 | 534 | while True: 535 | try: 536 | if not check_worker.is_alive(): 537 | logger.error("check worker is not alive, server quit") 538 | raise RuntimeError("check worker is not alive, server quit") 539 | if not api_server_worker.is_alive(): 540 | logger.error("api server worker is not alive, server quit") 541 | raise RuntimeError("api server worker is not alive, server quit") 542 | except RuntimeError: 543 | logger.error("runtime error, kill the server") 544 | break 545 | except KeyboardInterrupt: 546 | logger.error("keyboard interrupted, kill the server") 547 | break 548 | logger.error("bye") 549 | -------------------------------------------------------------------------------- /watchmen/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Watchmen 8 | 111 | 112 | 113 | 114 | 119 | 120 |
[index.html: interactive dashboard template; its markup and inline JavaScript were stripped in this dump. Recoverable fields: a Connection indicator, Host Name, and Query Time header, followed by the GPU status, working queue, and finished queue views shown in the README screenshots.]
-------------------------------------------------------------------------------- /watchmen/templates/old_index.html: --------------------------------------------------------------------------------
[old_index.html: legacy server-rendered template; its markup was stripped in this dump. Recoverable structure: a "Watchmen GPU Scheduler" heading and three Jinja-rendered sections. "GPU Status" shows {{ gpu_msg.hostname }} : {{ gpu_msg.query_time }} and a per-GPU table (Index, Name, Temp, Util, Memory, #Process). "Working Queue Status" lists clients (ID, Mode, GPU Scope, Request GPU Num, Queue Num, Last Request Time, Status). "Finished Queue Status" lists finished clients (ID, Mode, Request GPU Num, GPU Scope, Available GPUs, Queue Num, Last Request Time, Status). Status cells render OK / Waiting / Timeout badges.]
128 | 129 | 130 | -------------------------------------------------------------------------------- /watchmen/wait.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | import string 4 | import sys 5 | 6 | from watchmen import WatchClient 7 | 8 | 9 | def parse_args(in_args=None): 10 | arg_parser = argparse.ArgumentParser() 11 | arg_parser.add_argument("--task_name", type=str, required=True, help="Take Name") 12 | arg_parser.add_argument("--cuda", type=str, required=True, help="cuda to be waited") 13 | arg_parser.add_argument( 14 | "--req_gpu_num", 15 | type=int, 16 | required=False, 17 | default=1, 18 | help="request number of gpus", 19 | ) 20 | arg_parser.add_argument( 21 | "--wait", 22 | choices=["schedule", "queue", "none"], 23 | default="none", 24 | help="scheduling/queue wait", 25 | ) 26 | arg_parser.add_argument( 27 | "--token", 28 | type=str, 29 | default="", 30 | help="authentication token", 31 | ) 32 | arg_parser.add_argument( 33 | "--host", 34 | type=str, 35 | default="127.0.0.1", 36 | help="host", 37 | ) 38 | arg_parser.add_argument( 39 | "--port", 40 | type=int, 41 | default=62333, 42 | help="port", 43 | ) 44 | arg_info = arg_parser.parse_args(args=in_args) 45 | return arg_info 46 | 47 | 48 | if __name__ == "__main__": 49 | in_argv = parse_args() 50 | if in_argv.wait == "none": 51 | sys.exit(0) 52 | random_id = "-" + "".join(random.sample(string.ascii_letters + string.digits, 8)) 53 | exp_id = in_argv.task_name + random_id 54 | watch_client = WatchClient( 55 | id=exp_id, 56 | gpus=eval(f"[{in_argv.cuda}]"), 57 | server_host=in_argv.host, 58 | server_port=in_argv.port, 59 | req_gpu_num=in_argv.req_gpu_num, 60 | mode=in_argv.wait, 61 | timeout=60, 62 | token=in_argv.token, 63 | ) 64 | available_gpus = watch_client.wait() 65 | available_gpus = [str(x) for x in available_gpus] 66 | print(",".join(available_gpus), end="") 67 | -------------------------------------------------------------------------------- /working_queue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Spico197/watchmen/1075c5b3b32e65fb4692b4dfea3c768223a345cd/working_queue.png --------------------------------------------------------------------------------