├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── example ├── multi_card_mnist.py ├── single_card_mnist.py └── wait.py ├── finished_queue.png ├── homepage.png ├── requirements.txt ├── setup.py ├── tests ├── __init__.py └── test_client.py ├── watchmen ├── __init__.py ├── client.py ├── listener.py ├── reminder.py ├── server.py ├── templates │ ├── index.html │ └── old_index.html └── wait.py └── working_queue.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # self-defined 132 | .vscode/ 133 | data/ 134 | .watchmen_server.pid 135 | .watchmen_server.token 136 | .watchmen_client.token 137 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Spico 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: clean 2 | python3 setup.py sdist bdist_wheel 3 | 4 | test_upload: 5 | python3 -m twine upload --verbose --repository testpypi dist/* 6 | 7 | upload: 8 | python3 -m twine upload --repository pypi dist/* 9 | 10 | clean: 11 | rm -rf build/ 12 | rm -rf dist/ 13 | rm -rf *.egg-info/ 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Watchmen 2 | A simple and easy-to-use toolkit for GPU scheduling. 3 | 4 | ## Dependencies 5 | - [Python](https://www.python.org/downloads/) >= 3.6 6 | - requests >= 2.24.0 7 | - pydantic >= 1.7.1 8 | - gpustat >= 0.6.0 9 | - flask >= 1.1.2 10 | - apscheduler >= 3.6.3 11 | 12 | ## Installation 13 | 14 | 1. Install dependencies. 15 | ```bash 16 | $ pip install -r requirements.txt 17 | ``` 18 | 19 | 2. Install watchmen. 20 | 21 | Install from source code: 22 | ```bash 23 | $ pip install -e . 24 | ``` 25 | 26 | Or you can install the stable version package from pypi. 27 | ```bash 28 | $ pip install gpu-watchmen -i https://pypi.org/simple 29 | ``` 30 | 31 | ## Quick Start 32 | 1. 
Start the server 33 | 34 | The default port of the server is `62333` 35 | ```bash 36 | $ python -m watchmen.server 37 | ``` 38 | 39 | If you want the server to be running backend, try: 40 | ```bash 41 | $ nohup python -m watchmen.server 1>watchmen.log 2>&1 & 42 | ``` 43 | 44 | There are some configurations for the server 45 | ``` 46 | usage: server.py [-h] [--host HOST] [--port PORT] 47 | [--queue_timeout QUEUE_TIMEOUT] 48 | [--request_interval REQUEST_INTERVAL] 49 | [--status_queue_keep_time STATUS_QUEUE_KEEP_TIME] 50 | 51 | optional arguments: 52 | -h, --help show this help message and exit 53 | --host HOST host address for api server 54 | --port PORT port for api server 55 | --queue_timeout QUEUE_TIMEOUT 56 | timeout for queue waiting (seconds) 57 | --request_interval REQUEST_INTERVAL 58 | interval for gpu status requesting (seconds) 59 | --status_queue_keep_time STATUS_QUEUE_KEEP_TIME 60 | hours for keeping the client status. set `-1` to keep all clients' status 61 | ``` 62 | 63 | 2. Modify the source code in your project: 64 | 65 | ```python 66 | from watchmen import WatchClient 67 | 68 | client = WatchClient(id="short description of this running", gpus=[1], 69 | server_host="127.0.0.1", server_port=62333) 70 | client.wait() 71 | ``` 72 | 73 | When the program goes on after `client.wait()`, you are in the working queue. 74 | Watchmen supports two requesting mode: 75 | - `queue` mode means you are waiting for the gpus in `gpus` arguments. 76 | - `schedule` mode means you are waiting for the server to spare `req_gpu_num` of available GPUs in `gpus`. 77 | You can check examples in `example/` for further reading. 78 | 79 | ```bash 80 | # single card queue mode 81 | $ cd example && python single_card_mnist.py --id="single" --cuda=0 --wait 82 | # single card schedule mode 83 | $ cd example && python single_card_mnist.py --id="single schedule" --cuda=0,2,3 --req_gpu_num=1 --wait_mode="schedule" --wait 84 | # queue mode 85 | $ cd example && python multi_card_mnist.py --id="multi" --cuda=2,3 --wait 86 | # schedule mode 87 | $ cd example && python multi_card_mnist.py --id='multi card scheduling wait' --cuda=1,0,3 --req_gpu_num=2 --wait="schedule" 88 | ``` 89 | 90 | 3. Check the queue in browser. 91 | 92 | Open the following link to your browser: `http://:`, for example: `http://192.168.126.143:62333`. 93 | 94 | And you can get a result like the demo below. 95 | Please be aware that the page is not going to change dynamically, so you can refresh the page manually to check the latest status. 96 | 97 | Home page: GPU status 98 | 99 | ![HomePage](homepage.png) 100 | 101 | Working queue: 102 | ![WorkingQueue](working_queue.png) 103 | 104 | Finished queue: 105 | ![FinishedQueue](finished_queue.png) 106 | 107 | 108 | 1. Reminder when program is finished. 109 | 110 | `watchmen` also support email and other kinds of reminders for message informing. 111 | For example, you can send yourself an email when the program is finished. 112 | 113 | ```python 114 | from watchmen.reminder import send_email 115 | 116 | ... # your code here 117 | 118 | send_email( 119 | host="smtp.163.com", # email host to login, like `smtp.163.com` 120 | port=25, # email port to login, like `25` 121 | user="***@163.com", # user email address for login, like `***@163.com` 122 | password="***", # password or auth code for login 123 | receiver="***@outlook.com", # receiver email address 124 | html_message="
<p>Your program is finished!</p>
", # content, html format supported 125 | subject="Proram Finished Notice" # email subject 126 | ) 127 | ``` 128 | 129 | To get more reminders, please check `watchmen/reminder.py`. 130 | 131 | ## UPDATE 132 | - v0.4.0: add token authentication 133 | - v0.3.9: add `cancel` api and button in the working queue, fix json encoding bug with higher versions of flask 134 | - v0.3.8: change `OK` status to be shown only in the finished queue, and show `ready` in the working queue. Fix severe bug when scheduling 135 | - v0.3.7: much faster due to lock free changes! fix timeout and schedule bug 136 | - v0.3.6: fix front-end api hostname bug 137 | - v0.3.5: fix front-end api port bug 138 | - v0.3.4: refreshed interface, add `register_time` field, fix `check_finished` bug 139 | - v0.3.3: fix `check_finished` bug in server end, quit the main thread if the sub-thread is quit, and remove the backend cmd in the main thread 140 | - v0.3.2: fix `WatchClient` bug 141 | - v0.3.1: change `Client` into `WatchClient`, fix `ClientCollection` and `send_email` bug 142 | - v0.3.0: support gpu scheduling, fix blank input output, fix `check_gpus_existence` 143 | - v0.2.2: fix html package data, add multi-card example 144 | 145 | ## TODO 146 | - [ ] import user authentication modules to help the working queue delete operations 147 | - [ ] read programs' pids to help reading program working status and kill tasks remotely 148 | - [ ] test and support distributed model parallel configurations (with `python -m torch.distributed.launch`) 149 | - [ ] prettify the web page and divide functions into different tabs 150 | - [ ] gpu using stats for each user and process 151 | - [x] quit the main thread if the sub-thread is quit 152 | - [x] change `Client` into `WatchClient`, in case of any ambiguity 153 | - [x] `ClientCollection/__contains__` function should not include `finished_queue`, to help the `id` releases 154 | - [x] subject bug in `reminder/send_email()` 155 | - [x] add schedule feature, so clients only have to request for a number and range of gpus, and the server will assign the gpu num to clients 156 | - [x] add reminders 157 | - [x] add webui html support 158 | - [x] add examples 159 | -------------------------------------------------------------------------------- /example/multi_card_mnist.py: -------------------------------------------------------------------------------- 1 | """ 2 | modified from: 3 | https://github.com/pytorch/examples/blob/master/mnist/main.py 4 | """ 5 | from __future__ import print_function 6 | import argparse 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import torch.optim as optim 11 | from torchvision import datasets, transforms 12 | from torch.optim.lr_scheduler import StepLR 13 | 14 | from watchmen import WatchClient, ClientMode 15 | 16 | 17 | class Net(nn.Module): 18 | def __init__(self): 19 | super(Net, self).__init__() 20 | self.conv1 = nn.Conv2d(1, 32, 3, 1) 21 | self.conv2 = nn.Conv2d(32, 64, 3, 1) 22 | self.dropout1 = nn.Dropout(0.25) 23 | self.dropout2 = nn.Dropout(0.5) 24 | self.fc1 = nn.Linear(9216, 128) 25 | self.fc2 = nn.Linear(128, 10) 26 | 27 | def forward(self, x): 28 | x = self.conv1(x) 29 | x = F.relu(x) 30 | x = self.conv2(x) 31 | x = F.relu(x) 32 | x = F.max_pool2d(x, 2) 33 | x = self.dropout1(x) 34 | x = torch.flatten(x, 1) 35 | x = self.fc1(x) 36 | x = F.relu(x) 37 | x = self.dropout2(x) 38 | x = self.fc2(x) 39 | output = F.log_softmax(x, dim=1) 40 | return output 41 | 42 | 43 | def train(args, model, device, train_loader, optimizer, 
epoch): 44 | model.train() 45 | for batch_idx, (data, target) in enumerate(train_loader): 46 | data, target = data.to(device), target.to(device) 47 | optimizer.zero_grad() 48 | output = model(data) 49 | loss = F.nll_loss(output, target) 50 | loss.backward() 51 | optimizer.step() 52 | if batch_idx % args.log_interval == 0: 53 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 54 | epoch, batch_idx * len(data), len(train_loader.dataset), 55 | 100. * batch_idx / len(train_loader), loss.item())) 56 | if args.dry_run: 57 | break 58 | 59 | 60 | def test(model, device, test_loader): 61 | model.eval() 62 | test_loss = 0 63 | correct = 0 64 | with torch.no_grad(): 65 | for data, target in test_loader: 66 | data, target = data.to(device), target.to(device) 67 | output = model(data) 68 | test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss 69 | pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability 70 | correct += pred.eq(target.view_as(pred)).sum().item() 71 | 72 | test_loss /= len(test_loader.dataset) 73 | 74 | print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( 75 | test_loss, correct, len(test_loader.dataset), 76 | 100. * correct / len(test_loader.dataset))) 77 | 78 | 79 | def main(): 80 | # Training settings 81 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 82 | parser.add_argument('--batch-size', type=int, default=64, metavar='N', 83 | help='input batch size for training (default: 64)') 84 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 85 | help='input batch size for testing (default: 1000)') 86 | parser.add_argument('--epochs', type=int, default=50, metavar='N', 87 | help='number of epochs to train (default: 14)') 88 | parser.add_argument('--lr', type=float, default=1.0, metavar='LR', 89 | help='learning rate (default: 1.0)') 90 | parser.add_argument('--gamma', type=float, default=0.7, metavar='M', 91 | help='Learning rate step gamma (default: 0.7)') 92 | parser.add_argument('--dry-run', action='store_true', default=False, 93 | help='quickly check a single pass') 94 | parser.add_argument('--seed', type=int, default=1, metavar='S', 95 | help='random seed (default: 1)') 96 | parser.add_argument('--log-interval', type=int, default=10, metavar='N', 97 | help='how many batches to wait before logging training status') 98 | parser.add_argument('--save-model', action='store_true', default=False, 99 | help='For Saving the current Model') 100 | parser.add_argument("--id", type=str, default="id", 101 | help="identifier") 102 | parser.add_argument("--cuda", type=str, 103 | help="cuda device") 104 | parser.add_argument("--req_gpu_num", type=int, default=0, 105 | help="request gpu number if is `schedule` mode") 106 | parser.add_argument("--wait", type=str, default="queue", 107 | choices=["queue", "schedule", "none"], 108 | help="wait for watchmen signal") 109 | args = parser.parse_args() 110 | torch.manual_seed(args.seed) 111 | 112 | cudas = sorted(list(map(int, args.cuda.split(',')))) 113 | device = torch.device(f"cuda:{cudas[0]}") 114 | 115 | """WATCHMEN""" 116 | if args.wait == "queue": 117 | # queue wait 118 | client = WatchClient(id=f"mnist multi card {args.id} cuda={args.cuda}", gpus=cudas, 119 | server_host="127.0.0.1", server_port=62333) 120 | elif args.wait == "schedule": 121 | # scheduling wait 122 | client = WatchClient(id=f"mnist multi card {args.id} cuda={args.cuda}", 123 | gpus=cudas, mode=ClientMode.SCHEDULE, 
req_gpu_num=args.req_gpu_num, 124 | server_host="127.0.0.1", server_port=62334) 125 | device_ids = client.wait() 126 | """END OF WATCHMEN""" 127 | 128 | train_kwargs = {'batch_size': args.batch_size} 129 | test_kwargs = {'batch_size': args.test_batch_size} 130 | 131 | cuda_kwargs = {'num_workers': 1, 132 | 'pin_memory': True, 133 | 'shuffle': True} 134 | train_kwargs.update(cuda_kwargs) 135 | test_kwargs.update(cuda_kwargs) 136 | 137 | transform=transforms.Compose([ 138 | transforms.ToTensor(), 139 | transforms.Normalize((0.1307,), (0.3081,)) 140 | ]) 141 | 142 | dataset1 = datasets.MNIST('../data', train=True, download=True, 143 | transform=transform) 144 | dataset2 = datasets.MNIST('../data', train=False, 145 | transform=transform) 146 | train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) 147 | test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) 148 | 149 | model = Net() 150 | model = nn.DataParallel(model, device_ids=device_ids) 151 | model.to(device) 152 | optimizer = optim.Adadelta(model.parameters(), lr=args.lr) 153 | 154 | scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) 155 | for epoch in range(1, args.epochs + 1): 156 | train(args, model, device, train_loader, optimizer, epoch) 157 | test(model, device, test_loader) 158 | scheduler.step() 159 | 160 | if args.save_model: 161 | torch.save(model.module.state_dict(), "mnist_cnn.pt") 162 | 163 | 164 | if __name__ == '__main__': 165 | main() 166 | -------------------------------------------------------------------------------- /example/single_card_mnist.py: -------------------------------------------------------------------------------- 1 | """ 2 | modified from: 3 | https://github.com/pytorch/examples/blob/master/mnist/main.py 4 | """ 5 | from __future__ import print_function 6 | 7 | import sys 8 | import argparse 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | from torchvision import datasets, transforms 15 | from torch.optim.lr_scheduler import StepLR 16 | 17 | from watchmen import WatchClient 18 | from watchmen.client import ClientMode 19 | 20 | 21 | class Net(nn.Module): 22 | def __init__(self): 23 | super(Net, self).__init__() 24 | self.conv1 = nn.Conv2d(1, 32, 3, 1) 25 | self.conv2 = nn.Conv2d(32, 64, 3, 1) 26 | self.dropout1 = nn.Dropout(0.25) 27 | self.dropout2 = nn.Dropout(0.5) 28 | self.fc1 = nn.Linear(9216, 128) 29 | self.fc2 = nn.Linear(128, 10) 30 | 31 | def forward(self, x): 32 | x = self.conv1(x) 33 | x = F.relu(x) 34 | x = self.conv2(x) 35 | x = F.relu(x) 36 | x = F.max_pool2d(x, 2) 37 | x = self.dropout1(x) 38 | x = torch.flatten(x, 1) 39 | x = self.fc1(x) 40 | x = F.relu(x) 41 | x = self.dropout2(x) 42 | x = self.fc2(x) 43 | output = F.log_softmax(x, dim=1) 44 | return output 45 | 46 | 47 | def train(args, model, device, train_loader, optimizer, epoch): 48 | model.train() 49 | for batch_idx, (data, target) in enumerate(train_loader): 50 | data, target = data.to(device), target.to(device) 51 | optimizer.zero_grad() 52 | output = model(data) 53 | loss = F.nll_loss(output, target) 54 | loss.backward() 55 | optimizer.step() 56 | if batch_idx % args.log_interval == 0: 57 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 58 | epoch, batch_idx * len(data), len(train_loader.dataset), 59 | 100. 
* batch_idx / len(train_loader), loss.item())) 60 | if args.dry_run: 61 | break 62 | 63 | 64 | def test(model, device, test_loader): 65 | model.eval() 66 | test_loss = 0 67 | correct = 0 68 | with torch.no_grad(): 69 | for data, target in test_loader: 70 | data, target = data.to(device), target.to(device) 71 | output = model(data) 72 | test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss 73 | pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability 74 | correct += pred.eq(target.view_as(pred)).sum().item() 75 | 76 | test_loss /= len(test_loader.dataset) 77 | 78 | print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( 79 | test_loss, correct, len(test_loader.dataset), 80 | 100. * correct / len(test_loader.dataset))) 81 | 82 | 83 | def main(): 84 | # Training settings 85 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 86 | parser.add_argument('--batch-size', type=int, default=64, metavar='N', 87 | help='input batch size for training (default: 64)') 88 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 89 | help='input batch size for testing (default: 1000)') 90 | parser.add_argument('--epochs', type=int, default=50, metavar='N', 91 | help='number of epochs to train (default: 14)') 92 | parser.add_argument('--lr', type=float, default=1.0, metavar='LR', 93 | help='learning rate (default: 1.0)') 94 | parser.add_argument('--gamma', type=float, default=0.7, metavar='M', 95 | help='Learning rate step gamma (default: 0.7)') 96 | parser.add_argument('--dry-run', action='store_true', default=False, 97 | help='quickly check a single pass') 98 | parser.add_argument('--seed', type=int, default=1, metavar='S', 99 | help='random seed (default: 1)') 100 | parser.add_argument('--log-interval', type=int, default=10, metavar='N', 101 | help='how many batches to wait before logging training status') 102 | parser.add_argument('--save-model', action='store_true', default=False, 103 | help='For Saving the current Model') 104 | parser.add_argument("--id", type=str, default="id", 105 | help="identifier") 106 | parser.add_argument("--cuda", type=str, default="0", 107 | help="cuda devices, seperated by `,` with no spaces") 108 | parser.add_argument("--wait", action="store_true", 109 | help="wait for watchmen signal") 110 | parser.add_argument("--wait_mode", type=str, 111 | choices=["queue", "schedule"], default="queue", 112 | help="gpu waiting mode") 113 | args = parser.parse_args() 114 | torch.manual_seed(args.seed) 115 | 116 | """WATCHMEN""" 117 | if args.wait: 118 | if args.wait_mode == 'queue': 119 | waiting_mode = ClientMode.QUEUE 120 | else: 121 | waiting_mode = ClientMode.SCHEDULE 122 | client = WatchClient(id=f"mnist single card {args.id} cuda={args.cuda}", 123 | gpus=eval(f"[{args.cuda}]"), 124 | req_gpu_num=1, mode=waiting_mode, 125 | server_host="127.0.0.1", server_port=62333) 126 | # client.register() 127 | available_gpus = [] 128 | available_gpus = client.wait() 129 | if len(available_gpus) <= 0: 130 | sys.exit(1) 131 | else: 132 | device = torch.device(f"cuda:{available_gpus[0]}") 133 | """END OF WATCHMEN""" 134 | 135 | train_kwargs = {'batch_size': args.batch_size} 136 | test_kwargs = {'batch_size': args.test_batch_size} 137 | 138 | cuda_kwargs = {'num_workers': 1, 139 | 'pin_memory': True, 140 | 'shuffle': True} 141 | train_kwargs.update(cuda_kwargs) 142 | test_kwargs.update(cuda_kwargs) 143 | 144 | transform=transforms.Compose([ 145 | transforms.ToTensor(), 146 | 
transforms.Normalize((0.1307,), (0.3081,)) 147 | ]) 148 | 149 | dataset1 = datasets.MNIST('../data', train=True, download=True, 150 | transform=transform) 151 | dataset2 = datasets.MNIST('../data', train=False, 152 | transform=transform) 153 | train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) 154 | test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) 155 | 156 | model = Net().to(device) 157 | optimizer = optim.Adadelta(model.parameters(), lr=args.lr) 158 | 159 | scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) 160 | for epoch in range(1, args.epochs + 1): 161 | train(args, model, device, train_loader, optimizer, epoch) 162 | test(model, device, test_loader) 163 | scheduler.step() 164 | 165 | if args.save_model: 166 | torch.save(model.state_dict(), "mnist_cnn.pt") 167 | 168 | 169 | if __name__ == '__main__': 170 | main() 171 | -------------------------------------------------------------------------------- /example/wait.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | import torch 5 | 6 | from watchmen import WatchClient 7 | from watchmen.client import ClientMode 8 | 9 | 10 | def main(): 11 | # Training settings 12 | parser = argparse.ArgumentParser(description='Minimal GPU Scheduling Example') 13 | parser.add_argument("--id", type=str, default="id", 14 | help="identifier") 15 | parser.add_argument("--cuda", type=str, default="0", 16 | help="cuda devices, seperated by `,` with no spaces") 17 | parser.add_argument("--wait", action="store_true", 18 | help="wait for watchmen signal") 19 | parser.add_argument("--wait_mode", type=str, 20 | choices=["queue", "schedule"], default="queue", 21 | help="gpu waiting mode") 22 | parser.add_argument("--req_gpu_num", type=int, default=1, 23 | help="number of GPUs to request") 24 | parser.add_argument("--token", type=str, default="", 25 | help="authentication token") 26 | args = parser.parse_args() 27 | 28 | """WATCHMEN""" 29 | if args.wait: 30 | if args.wait_mode == 'queue': 31 | waiting_mode = ClientMode.QUEUE 32 | else: 33 | waiting_mode = ClientMode.SCHEDULE 34 | client = WatchClient(id=f"mnist single card {args.id} cuda={args.cuda}", 35 | gpus=eval(f"[{args.cuda}]"), 36 | req_gpu_num=args.req_gpu_num, mode=waiting_mode, 37 | server_host="127.0.0.1", server_port=62333, 38 | token=args.token) 39 | # client.register() 40 | available_gpus = [] 41 | available_gpus = client.wait() 42 | if len(available_gpus) <= 0: 43 | sys.exit(1) 44 | else: 45 | device = torch.device(f"cuda:{available_gpus[0]}") 46 | """END OF WATCHMEN""" 47 | print(f"Using GPU: {device}") 48 | input("Press Enter to continue...") 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /finished_queue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Spico197/watchmen/1075c5b3b32e65fb4692b4dfea3c768223a345cd/finished_queue.png -------------------------------------------------------------------------------- /homepage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Spico197/watchmen/1075c5b3b32e65fb4692b4dfea3c768223a345cd/homepage.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | APScheduler>=3.6.3 2 | 
Flask>=1.1.2 3 | gpustat>=0.6.0 4 | pydantic>=1.7.1 5 | requests>=2.24.0 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | from watchmen import __version__ 4 | 5 | 6 | with open("README.md", "r") as fh: 7 | long_description = fh.read() 8 | 9 | setuptools.setup( 10 | name='gpu-watchmen', 11 | version=__version__, 12 | author="Tong Zhu", 13 | author_email="tzhu1997@outlook.com", 14 | description="watchmen for GPU scheduling", 15 | long_description_content_type="text/markdown", 16 | long_description=long_description, 17 | url="https://github.com/Spico197/watchmen", 18 | packages=[ 19 | "watchmen" 20 | ], 21 | classifiers=[ 22 | "Programming Language :: Python :: 3", 23 | "License :: OSI Approved :: MIT License", 24 | "Operating System :: POSIX :: Linux" 25 | ], 26 | python_requires='>=3.6', 27 | install_requires=[ 28 | "apscheduler>=3.6.3", 29 | "flask>=1.1.2", 30 | "gpustat>=0.6.0", 31 | "pydantic>=1.7.1", 32 | "requests>=2.24.0", 33 | ], 34 | package_data={ 35 | 'watchmen' : [ 36 | 'templates/*.html' 37 | ], 38 | }, 39 | include_package_data=True, 40 | ) 41 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Spico197/watchmen/1075c5b3b32e65fb4692b4dfea3c768223a345cd/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_client.py: -------------------------------------------------------------------------------- 1 | from watchmen.client import ClientMode 2 | 3 | 4 | def test_in_mode_method(): 5 | assert ClientMode.has_value("queue") is True 6 | -------------------------------------------------------------------------------- /watchmen/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import WatchClient 2 | from .client import ClientMode 3 | 4 | __version__ = "0.4.0" 5 | -------------------------------------------------------------------------------- /watchmen/client.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | import datetime 4 | import getpass 5 | import os 6 | from enum import Enum 7 | from typing import List, Optional 8 | from collections import OrderedDict 9 | 10 | import requests 11 | from pydantic import BaseModel 12 | 13 | from watchmen.listener import check_gpus_existence, check_req_gpu_num 14 | 15 | 16 | logger = logging.getLogger("common") 17 | logger.setLevel(logging.INFO) # Change to INFO to see info messages 18 | 19 | # Add a handler if none exists 20 | if not logger.handlers: 21 | console_handler = logging.StreamHandler() 22 | console_handler.setLevel(logging.INFO) 23 | formatter = logging.Formatter( 24 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 25 | ) 26 | console_handler.setFormatter(formatter) 27 | logger.addHandler(console_handler) 28 | 29 | TOKEN_FILE = ".watchmen_client.token" 30 | 31 | 32 | class ClientStatus(str, Enum): 33 | WAITING = "waiting" 34 | TIMEOUT = "timeout" 35 | READY = "ready" 36 | OK = "ok" 37 | CANCELLED = "cancelled" 38 | 39 | 40 | class ClientMode(str, Enum): 41 | QUEUE = "queue" 42 | SCHEDULE = "schedule" 43 | 44 | @classmethod 45 | def has_value(cls, value): 46 | return value in set(cls._member_map_.values()) 47 | 48 | 49 | class 
ClientModel(BaseModel): 50 | id: str # identifier in string format 51 | # `queue` (wait for specific gpus) or `schedule` (schedule by the server automatically) 52 | mode: Optional[ClientMode] = ClientMode.QUEUE 53 | register_time: Optional[datetime.datetime] = None # datetime.datetime 54 | last_request_time: Optional[datetime.datetime] = None # datetime.datetime 55 | status: Optional[ClientStatus] = ClientStatus.WAITING # `waiting`, `timeout`, `ok` 56 | queue_num: Optional[int] = 0 # queue number 57 | # `queue` mode: gpus for requesting to run on; `schedule` mode: available gpu scope. 58 | gpus: Optional[List[int]] = [] 59 | msg: Optional[str] = "" # error or status message 60 | req_gpu_num: Optional[int] = 0 # `schedule` mode: how many gpus are requested 61 | available_gpus: Optional[List[int]] = [] 62 | 63 | 64 | class ClientCollection(object): 65 | def __init__(self): 66 | self.work_queue = OrderedDict() # only `ok` and `waiting` 67 | self.finished_queue = OrderedDict() 68 | 69 | def mark_finished(self, client_id: str): 70 | self.finished_queue[client_id] = self.work_queue[client_id] 71 | self.work_queue.pop(client_id) 72 | 73 | def get_all_clients(self): 74 | all_clients = [] 75 | all_clients.extend(list(self.finished_queue.values())) 76 | all_clients.sort(key=lambda x: x.last_request_time) 77 | all_clients.extend(list(self.work_queue.values())) 78 | return all_clients 79 | 80 | def __getitem__(self, index: str): 81 | if index in self.work_queue: 82 | return self.work_queue[index] 83 | else: 84 | raise IndexError(f"index: {index} does not exist or has finished") 85 | 86 | def __contains__(self, index: str): 87 | return index in self.work_queue 88 | 89 | 90 | def load_token_from_file(): 91 | """Load authentication token from file if it exists.""" 92 | if os.path.exists(TOKEN_FILE): 93 | with open(TOKEN_FILE, "r") as f: 94 | return f.read().strip() 95 | return None 96 | 97 | 98 | def save_token_to_file(token): 99 | """Save authentication token to file.""" 100 | with open(TOKEN_FILE, "w") as f: 101 | f.write(token) 102 | 103 | 104 | class WatchClient(object): 105 | def __init__( 106 | self, 107 | id: str, 108 | gpus: List[int], 109 | server_host: str, 110 | server_port: int, 111 | mode: Optional[ClientMode] = ClientMode.QUEUE, 112 | req_gpu_num: Optional[int] = 0, 113 | timeout: Optional[int] = 60, 114 | token: Optional[str] = None, 115 | ): 116 | self.base_url = f"http://{server_host}:{server_port}" 117 | self.id = f"{getpass.getuser()}@{id}" 118 | if self._validate_gpus(gpus): 119 | self.gpus = gpus 120 | else: 121 | raise ValueError("Check the GPU existence") 122 | if not self._validate_mode(mode): 123 | raise ValueError(f"Check the mode: {mode}") 124 | self.mode = mode 125 | if self.mode == ClientMode.SCHEDULE: 126 | if not self._validate_req_gpu_num(req_gpu_num): 127 | raise ValueError(f"Check the `req_gpu_num`: {req_gpu_num}") 128 | self.req_gpu_num = req_gpu_num 129 | self.timeout = timeout 130 | 131 | # Handle token authentication 132 | self.token = token 133 | if not self.token: 134 | logger.info(f"No token provided, trying to load from file {TOKEN_FILE}") 135 | self.token = load_token_from_file() 136 | if self.token: 137 | logger.info(f"Dump token to file {TOKEN_FILE}") 138 | save_token_to_file(self.token) 139 | else: 140 | logger.info("No token provided, and no token file found") 141 | 142 | def _validate_gpus(self, gpus: List[int]): 143 | return check_gpus_existence(gpus) 144 | 145 | def _validate_mode(self, mode: ClientMode): 146 | return ClientMode.has_value(mode) 147 | 148 
| def _validate_req_gpu_num(self, req_gpu_num: int): 149 | return check_req_gpu_num(req_gpu_num) 150 | 151 | def _get_headers(self): 152 | """Get request headers with authentication token if available.""" 153 | headers = {"Content-Type": "application/json"} 154 | if self.token: 155 | headers["X-Auth-Token"] = self.token 156 | return headers 157 | 158 | def register(self): 159 | data = { 160 | "id": self.id, 161 | "gpus": self.gpus, 162 | "mode": self.mode, 163 | "req_gpu_num": self.req_gpu_num, 164 | } 165 | result = requests.post( 166 | self.base_url + "/client/register", 167 | json=data, 168 | headers=self._get_headers(), 169 | timeout=self.timeout, 170 | ) 171 | result = result.json() 172 | if result["status"] != "ok": 173 | raise RuntimeError(f"err registering: {result['msg']}") 174 | 175 | def ping(self): 176 | data = {"id": self.id} 177 | result = requests.post( 178 | self.base_url + "/client/ping", 179 | json=data, 180 | headers=self._get_headers(), 181 | timeout=self.timeout, 182 | ).json() 183 | if result["status"] != "ok": 184 | raise RuntimeError(f"err registering: {result['msg']}") 185 | else: 186 | if result["msg"] == ClientStatus.WAITING: 187 | return False, result["available_gpus"] 188 | elif result["msg"] == ClientStatus.READY: 189 | return True, result["available_gpus"] 190 | elif result["msg"] == ClientStatus.OK: 191 | logger.warning("Status is OK, which has finished requesting GPUs.") 192 | return False, result["available_gpus"] 193 | elif result["msg"] == ClientStatus.TIMEOUT: 194 | raise RuntimeError("status changed to TIMEOUT") 195 | elif result["msg"] == ClientStatus.CANCELLED: 196 | raise RuntimeError("client has been cancelled") 197 | 198 | def wait(self): 199 | self.register() 200 | flag = False 201 | available_gpus = [] 202 | while not flag: 203 | flag, available_gpus = self.ping() 204 | time.sleep(10) 205 | return available_gpus 206 | -------------------------------------------------------------------------------- /watchmen/listener.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List 3 | 4 | from gpustat.core import GPUStatCollection 5 | 6 | 7 | def is_single_gpu_totally_free(gpu_index: int): 8 | gs = GPUStatCollection.new_query() 9 | 10 | if not isinstance(gpu_index, int): 11 | raise ValueError(f"gpu_index: {gpu_index} is not int") 12 | if gpu_index >= len(gs.gpus) or gpu_index < 0: 13 | raise ValueError(f"gpu_index: {gpu_index} does not exist") 14 | 15 | gpu = gs.gpus[gpu_index] 16 | if len(gpu.processes) <= 0 \ 17 | and gpu.utilization <= 10 \ 18 | and (float(gpu.memory_used) / float(gpu.memory_total) <= 1e-3 or gpu.memory_used < 50): 19 | return True 20 | else: 21 | return False 22 | 23 | 24 | def check_gpus_existence(gpus: List[int]): 25 | gs = GPUStatCollection.new_query() 26 | for gpu in gpus: 27 | try: 28 | gs.gpus[gpu] 29 | except KeyError: 30 | return False 31 | return True 32 | 33 | 34 | def check_req_gpu_num(req_gpu_num: int): 35 | gs = GPUStatCollection.new_query() 36 | return req_gpu_num <= len(gs.gpus) 37 | 38 | 39 | class GPUInfo(object): 40 | def __init__(self): 41 | self.gpus = [] 42 | self.new_query() 43 | 44 | def new_query(self): 45 | gs = GPUStatCollection.new_query() 46 | self.gpus = gs.gpus 47 | self.gs = gs 48 | 49 | def _is_totally_free(self, gpu_index: int): 50 | self.new_query() 51 | gpu = self.gpus[gpu_index] 52 | if len(gpu.processes) <= 0 \ 53 | and gpu.utilization <= 10 \ 54 | and (float(gpu.memory_used) / float(gpu.memory_total) <= 1e-3 or 
gpu.memory_used < 50): 55 | return True 56 | else: 57 | return False 58 | 59 | def is_gpus_available(self, gpus: List[int]): 60 | stts = [] 61 | for gpu in gpus: 62 | stts.append(self._is_totally_free(gpu)) 63 | return all(stts) 64 | 65 | def get_available_gpus_in_scope(self, gpu_scope: List[int]): 66 | available_gpus = [] 67 | for gpu in gpu_scope: 68 | if self._is_totally_free(gpu): 69 | available_gpus.append(gpu) 70 | return available_gpus 71 | 72 | def is_req_gpu_num_satisfied(self, gpu_scope: List[int], req_gpu_num: int): 73 | ok = False 74 | available_gpus = self.get_available_gpus_in_scope(gpu_scope) 75 | if req_gpu_num <= len(available_gpus): 76 | available_gpus = available_gpus[:req_gpu_num] 77 | ok = True 78 | return ok, available_gpus 79 | 80 | def __getitem__(self, index: int): 81 | return self._is_totally_free(index) 82 | 83 | def __str__(self): 84 | tmp = self.gs.jsonify() 85 | tmp["query_time"] = str(tmp["query_time"]) 86 | return json.dumps(tmp, indent=2, ensure_ascii=False) 87 | -------------------------------------------------------------------------------- /watchmen/reminder.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import hashlib 3 | import hmac 4 | import base64 5 | import urllib 6 | import smtplib 7 | from email.mime.multipart import MIMEMultipart 8 | from email.mime.text import MIMEText 9 | from typing import Optional, List 10 | 11 | import requests 12 | 13 | 14 | def send_email( 15 | host: str, # email host to login, like `smtp.163.com` 16 | port: int, # email port to login, like `25` 17 | user: str, # user email address for login, like `***@163.com` 18 | password: str, # password or auth code for login 19 | receiver: str, # receiver email address 20 | html_message: str, # content, html format supported 21 | subject: Optional[str] = "Notice" # email subject 22 | ): 23 | # set up the SMTP server 24 | s = smtplib.SMTP(host=host, port=port) 25 | s.starttls() 26 | s.login(user, password) 27 | 28 | msg = MIMEMultipart() # create a message 29 | msg['From'] = user 30 | msg['To'] = receiver 31 | msg['Subject'] = subject 32 | msg.attach(MIMEText(html_message, 'html')) 33 | s.send_message(msg) 34 | del msg 35 | # Terminate the SMTP session and close the connection 36 | s.quit() 37 | 38 | 39 | def send_dingtalk_msg( 40 | dingtalk_user_mentions: List[str], # which user to mention, like `[183********]` 41 | dingtalk_secret: str, # like SEc1f**** 42 | dingtalk_webhook_url: str, # like `https://oapi.dingtalk.com/robot/send?access_token=***` 43 | message: str # message content 44 | ): 45 | r""" 46 | Reference: 47 | - https://github.com/huggingface/knockknock 48 | """ 49 | msg_template = { 50 | "msgtype": "text", 51 | "text": { 52 | "content": message 53 | }, 54 | "at": { 55 | "atMobiles": dingtalk_user_mentions, 56 | "isAtAll": False 57 | } 58 | } 59 | ''' 60 | construct_encrypted_url 61 | Visit https://ding-doc.dingtalk.com/doc#/serverapi2/qf2nxq for details 62 | ''' 63 | timestamp = round(datetime.datetime.now().timestamp() * 1000) 64 | secret_enc = dingtalk_secret.encode('utf-8') 65 | string_to_sign = '{}\n{}'.format(timestamp, dingtalk_secret) 66 | string_to_sign_enc = string_to_sign.encode('utf-8') 67 | hmac_code = hmac.new(secret_enc, string_to_sign_enc, digestmod=hashlib.sha256).digest() 68 | sign = urllib.parse.quote_plus(base64.b64encode(hmac_code)) 69 | encrypted_url = dingtalk_webhook_url + '×tamp={}'\ 70 | .format(timestamp) + '&sign={}'.format(sign) 71 | postto = encrypted_url 72 | requests.post(postto, 
json=msg_template) 73 | -------------------------------------------------------------------------------- /watchmen/server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import queue 4 | import logging 5 | import readline # noqa: F401 6 | import datetime 7 | import argparse 8 | import threading 9 | import secrets 10 | from functools import wraps 11 | 12 | from flask import Flask, jsonify, request, render_template, make_response, session 13 | from flask.json.provider import DefaultJSONProvider 14 | from apscheduler.schedulers.blocking import BlockingScheduler 15 | 16 | from watchmen.listener import ( 17 | is_single_gpu_totally_free, 18 | check_gpus_existence, 19 | check_req_gpu_num, 20 | GPUInfo, 21 | ) 22 | from watchmen.client import ClientStatus, ClientMode, ClientModel, ClientCollection 23 | 24 | 25 | apscheduler_logger = logging.getLogger("apscheduler") 26 | apscheduler_logger.setLevel(logging.ERROR) 27 | logger = logging.getLogger("common") 28 | logger.setLevel(logging.INFO) 29 | fmt = "[%(asctime)-15s]-%(levelname)s-%(filename)s-%(lineno)d-%(process)d: %(message)s" 30 | datefmt = "%a %d %b %Y %H:%M:%S" 31 | formatter = logging.Formatter(fmt, datefmt) 32 | stream_handler = logging.StreamHandler(sys.stdout) 33 | stream_handler.setFormatter(formatter) 34 | logger.addHandler(stream_handler) 35 | 36 | app = Flask("watchmen.server") 37 | app.secret_key = os.environ.get("WATCHMEN_SECRET_KEY", secrets.token_hex(32)) 38 | gpu_queue = queue.Queue() 39 | gpu_info = GPUInfo() 40 | gpu_queue.put(1) 41 | client_queue = queue.Queue() 42 | cc = ClientCollection() 43 | client_queue.put(1) 44 | 45 | APP_PORT = None 46 | AUTH_TOKEN = None 47 | PID_FILE = ".watchmen_server.pid" 48 | TOKEN_FILE = ".watchmen_server.token" 49 | 50 | 51 | class CustomJSONProvider(DefaultJSONProvider): 52 | def default(self, obj): 53 | try: 54 | if isinstance(obj, datetime.datetime): 55 | return obj.strftime("%Y-%m-%d %H:%M:%S") 56 | iterable = iter(obj) 57 | except TypeError: 58 | pass 59 | else: 60 | return list(iterable) 61 | return DefaultJSONProvider.default(self, obj) 62 | 63 | 64 | app.json_provider_class = CustomJSONProvider 65 | 66 | 67 | def load_token_from_file(): 68 | """Load authentication token from file if it exists.""" 69 | if os.path.exists(TOKEN_FILE): 70 | with open(TOKEN_FILE, "r") as f: 71 | return f.read().strip() 72 | return None 73 | 74 | 75 | def save_token_to_file(token): 76 | """Save authentication token to file.""" 77 | with open(TOKEN_FILE, "w") as f: 78 | f.write(token) 79 | 80 | 81 | def generate_token(): 82 | """Generate a random token.""" 83 | return secrets.token_hex(16) # 32 character hex string 84 | 85 | 86 | def login_required(f): 87 | """Decorator to require authentication for a route.""" 88 | 89 | @wraps(f) 90 | def decorated_function(*args, **kwargs): 91 | if not AUTH_TOKEN: 92 | return f(*args, **kwargs) # No authentication required 93 | 94 | # Check session 95 | if session.get("authenticated"): 96 | return f(*args, **kwargs) 97 | 98 | # Check token in headers or query parameters 99 | token = request.headers.get("X-Auth-Token") or request.args.get("token") 100 | if token and token == AUTH_TOKEN: 101 | session["authenticated"] = True 102 | return f(*args, **kwargs) 103 | 104 | return jsonify({"status": "err", "msg": "Authentication required"}), 401 105 | 106 | return decorated_function 107 | 108 | 109 | @app.route("/auth", methods=["POST"]) 110 | def authenticate(): 111 | """Authenticate with token.""" 112 | if not 
AUTH_TOKEN: 113 | return jsonify({"status": "ok", "msg": "No authentication required"}) 114 | 115 | data = request.get_json() 116 | if not data or "token" not in data: 117 | return jsonify({"status": "err", "msg": "Token required"}), 400 118 | 119 | if data["token"] == AUTH_TOKEN: 120 | session["authenticated"] = True 121 | return jsonify({"status": "ok", "msg": "Authentication successful"}) 122 | else: 123 | return jsonify({"status": "err", "msg": "Invalid token"}), 401 124 | 125 | 126 | @app.route("/gpu/") 127 | @login_required 128 | def get_single_gpu_status(gpu_id: int): 129 | status = "" 130 | msg = "" 131 | try: 132 | msg = is_single_gpu_totally_free(gpu_id) 133 | status = "ok" 134 | except ValueError as err: 135 | msg = str(err) 136 | status = "err" 137 | return jsonify({"status": status, "msg": msg}) 138 | 139 | 140 | @app.route("/gpus/") 141 | @login_required 142 | def get_gpus_status(gpu_ids: str): 143 | status = "err" 144 | msg = "" 145 | detail = [] 146 | try: 147 | gpu_ids = sorted(map(int, gpu_ids.split(","))) 148 | for gpu_id in gpu_ids: 149 | detail.append({"gpu": gpu_id, "status": is_single_gpu_totally_free(gpu_id)}) 150 | if all(detail): 151 | msg = True 152 | else: 153 | msg = False 154 | status = "ok" 155 | except ValueError as err: 156 | msg = str(err) 157 | status = "err" 158 | return jsonify({"status": status, "msg": msg, "detail": detail}) 159 | 160 | 161 | @app.route("/client/ping", methods=["POST"]) 162 | @login_required 163 | def client_ping(): 164 | client_info = ClientModel(**request.json) 165 | status = "" 166 | available_gpus = [] 167 | msg = "" 168 | client_id = client_info.id 169 | if client_id in cc: 170 | cc[client_id].last_request_time = datetime.datetime.now() 171 | status = "ok" 172 | available_gpus = cc[client_id].available_gpus 173 | msg = cc[client_id].status 174 | elif client_id in cc.finished_queue: 175 | status = "ok" 176 | available_gpus = cc.finished_queue[client_id].available_gpus 177 | msg = cc.finished_queue[client_id].status 178 | else: 179 | status = "err" 180 | msg = "client not registered or has been cancelled" 181 | info = {"status": status, "available_gpus": available_gpus, "msg": msg} 182 | logger.info(f"client {client_id} ping: {info}") 183 | return jsonify(info) 184 | 185 | 186 | @app.route("/client/register", methods=["POST"]) 187 | @login_required 188 | def client_register(): 189 | client_info = ClientModel(**request.json) 190 | status = "" 191 | msg = "" 192 | if len(client_info.gpus) <= 0: 193 | status = "err" 194 | msg = "gpus must not be empty!" 195 | elif not ClientMode.has_value(client_info.mode): 196 | status = "err" 197 | msg = f"mode {client_info.mode} is not supported" 198 | elif not check_gpus_existence(client_info.gpus): 199 | status = "err" 200 | msg = "check the gpus existence" 201 | elif client_info.mode == ClientMode.SCHEDULE and not check_req_gpu_num( 202 | client_info.req_gpu_num 203 | ): 204 | status = "err" 205 | msg = "`req_gpu_num` is not valid" 206 | else: 207 | if client_info.id not in cc: 208 | client = ClientModel( 209 | id=client_info.id, 210 | mode=client_info.mode, 211 | status=ClientStatus.WAITING, 212 | register_time=datetime.datetime.now(), 213 | last_request_time=datetime.datetime.now(), 214 | queue_num=len(cc.work_queue), 215 | gpus=client_info.gpus, 216 | req_gpu_num=client_info.req_gpu_num, 217 | ) 218 | cc.work_queue[client.id] = client 219 | status = "ok" 220 | else: 221 | status = "err" 222 | msg = f"client_id: {client_info.id} has been registered!" 
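        # Response contract: {"status": "ok" | "err", "msg": ...}; WatchClient.register()
        # raises a RuntimeError whenever status != "ok", so a duplicate id fails fast on the client side.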
223 | return jsonify({"status": status, "msg": msg}) 224 | 225 | 226 | @app.route("/client/cancel", methods=["POST"]) 227 | @login_required 228 | def client_cancel(): 229 | status = "" 230 | msg = "" 231 | try: 232 | client_id = request.json.get("id") 233 | if client_id in cc: 234 | client = cc[client_id] 235 | if client.status in [ClientStatus.WAITING, ClientStatus.READY]: 236 | client.status = ClientStatus.CANCELLED 237 | cc.mark_finished(client_id) 238 | status = "ok" 239 | msg = f"Client {client_id} cancelled successfully" 240 | else: 241 | status = "err" 242 | msg = f"Client {client_id} is not waiting" 243 | else: 244 | status = "err" 245 | msg = f"Client {client_id} not found" 246 | except Exception as err: 247 | status = "err" 248 | msg = str(err) 249 | return jsonify({"status": status, "msg": msg}) 250 | 251 | 252 | @app.route("/show/work", methods=["GET"]) 253 | @login_required 254 | def show_work(): 255 | status = "" 256 | msg = "" 257 | try: 258 | status = "ok" 259 | msg = [x.dict() for x in cc.work_queue.values()] 260 | except Exception as err: 261 | status = "err" 262 | msg = str(err) 263 | return jsonify({"status": status, "msg": msg}) 264 | 265 | 266 | @app.route("/show/finished", methods=["GET"]) 267 | @login_required 268 | def show_finished(): 269 | status = "" 270 | msg = "" 271 | try: 272 | status = "ok" 273 | msg = [x.dict() for x in cc.finished_queue.values()] 274 | except Exception as err: 275 | status = "err" 276 | msg = str(err) 277 | return jsonify({"status": status, "msg": msg}) 278 | 279 | 280 | @app.route("/show/gpus", methods=["GET"]) 281 | @login_required 282 | def show_gpus(): 283 | status = "" 284 | msg = "" 285 | try: 286 | status = "ok" 287 | msg = gpu_info.gs.jsonify() 288 | msg["query_time"] = str(msg["query_time"]) 289 | except Exception as err: 290 | status = "err" 291 | msg = str(err) 292 | return jsonify({"status": status, "msg": msg}) 293 | 294 | 295 | @app.route("/api", methods=["GET", "OPTIONS"]) 296 | @login_required 297 | def api(): 298 | if request.method == "OPTIONS": 299 | response = make_response() 300 | else: 301 | gpu_info = show_gpus() 302 | gpu_msg = gpu_info.json["msg"] 303 | work_info = show_work() 304 | work_msg = work_info.json["msg"] 305 | finished_info = show_finished() 306 | finished_msg = finished_info.json["msg"] 307 | response = jsonify( 308 | {"gpu": gpu_msg, "work_queue": work_msg, "finished_queue": finished_msg} 309 | ) 310 | response.headers["Access-Control-Allow-Origin"] = "*" 311 | return response 312 | 313 | 314 | @app.route("/", methods=["GET"]) 315 | def index(): 316 | global APP_PORT 317 | is_authenticated = session.get("authenticated", False) or not AUTH_TOKEN 318 | auth_required = AUTH_TOKEN is not None 319 | return render_template( 320 | "index.html", 321 | port=APP_PORT, 322 | is_authenticated=is_authenticated, 323 | auth_required=auth_required, 324 | ) 325 | 326 | 327 | @app.route("/old", methods=["GET"]) 328 | @login_required 329 | def old_index(): 330 | gpu_info = show_gpus() 331 | gpu_msg = gpu_info.json["msg"] 332 | work_info = show_work() 333 | work_msg = work_info.json 334 | finished_info = show_finished() 335 | finished_msg = finished_info.json 336 | return render_template( 337 | "old_index.html", gpu_msg=gpu_msg, work_msg=work_msg, finished_msg=finished_msg 338 | ) 339 | 340 | 341 | def check_gpu_info(): 342 | gpu_info.new_query() 343 | logger.info("check gpu info") 344 | 345 | 346 | def check_work(queue_timeout): 347 | logger.info("regular check") 348 | marked_finished = [] 349 | reserved_gpus = set() 
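    # Two-pass assignment: the first pass drops timed-out clients and records, for every
    # waiting client, whether its requested GPUs (queue mode) or enough GPUs from its scope
    # (schedule mode) are currently free; GPUs already promised to `ready` clients are kept
    # in `reserved_gpus`. The second pass below marks a client `ready` only when its candidate
    # GPUs do not overlap `reserved_gpus`, so a free GPU is never granted to two clients at once.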
350 | client_list = [] 351 | queue_num = 0 352 | for client_id, client in cc.work_queue.items(): 353 | time_delta = datetime.datetime.now() - client.last_request_time 354 | logger.info( 355 | f"client: {client.id}, time_delta.seconds: {time_delta.seconds}, time_delta: {time_delta}" 356 | ) 357 | if time_delta.seconds > queue_timeout: 358 | if client.status != ClientStatus.READY: 359 | client.status = ClientStatus.TIMEOUT 360 | else: 361 | client.status = ClientStatus.OK 362 | # invalid client 363 | client.queue_num = -1 364 | marked_finished.append(client_id) 365 | continue 366 | client.queue_num = queue_num 367 | ok = False 368 | available_gpus = [] 369 | if client.status == ClientStatus.READY: 370 | reserved_gpus |= set(client.available_gpus) 371 | else: 372 | try: 373 | if client.mode == "queue": 374 | ok = gpu_info.is_gpus_available(client.gpus) 375 | available_gpus = client.gpus 376 | elif client.mode == "schedule": 377 | ok, available_gpus = gpu_info.is_req_gpu_num_satisfied( 378 | client.gpus, client.req_gpu_num 379 | ) 380 | else: 381 | raise RuntimeError(f"Not supported mode: {client.mode}") 382 | except IndexError as err: 383 | client.msg = str(err) 384 | except ValueError as err: 385 | client.msg = str(err) 386 | except RuntimeError as err: 387 | client.msg = str(err) 388 | 389 | client_list.append([client_id, client, ok, set(available_gpus)]) 390 | queue_num += 1 391 | 392 | # post check and assignment, and make sure gpus of `ready` clients will not be assigned to the others 393 | for client_id, client, ok, available_gpu_set in client_list: 394 | if ( 395 | ok 396 | and len(available_gpu_set) > 0 397 | and len(available_gpu_set & reserved_gpus) < 1 398 | ): 399 | client.status = ClientStatus.READY 400 | client.available_gpus = available_gpus 401 | reserved_gpus |= set(client.available_gpus) 402 | logger.info( 403 | f"client: {client.id} is ready, available gpus: {client.available_gpus}" 404 | ) 405 | 406 | for client_id in marked_finished: 407 | logger.info(f"client {client.id} marked as finished, status: {client.status}") 408 | cc.mark_finished(client_id) 409 | 410 | 411 | def check_finished(status_queue_keep_time): 412 | logger.info("check out-dated finished clients") 413 | marked_delete_ids = [] 414 | for client_id, client in cc.finished_queue.items(): 415 | delta = datetime.datetime.now() - client.last_request_time 416 | if (delta.days * 24 + delta.seconds / 3600) >= status_queue_keep_time: 417 | marked_delete_ids.append(client_id) 418 | for client_id in marked_delete_ids: 419 | cc.finished_queue.pop(client_id) 420 | logger.info(f"remove {client.id} from finished queue") 421 | 422 | 423 | def regular_check(request_interval, queue_timeout, status_queue_keep_time): 424 | scheduler = BlockingScheduler(logger=apscheduler_logger) 425 | scheduler.add_job( 426 | check_gpu_info, 427 | trigger="interval", 428 | seconds=request_interval, 429 | next_run_time=datetime.datetime.now(), 430 | ) 431 | scheduler.add_job( 432 | check_work, 433 | trigger="interval", 434 | seconds=request_interval * 5, 435 | args=(queue_timeout,), 436 | next_run_time=datetime.datetime.now(), 437 | ) 438 | if status_queue_keep_time != -1: 439 | scheduler.add_job( 440 | check_finished, 441 | trigger="interval", 442 | hours=status_queue_keep_time, 443 | args=(status_queue_keep_time,), 444 | next_run_time=datetime.datetime.now(), 445 | ) 446 | scheduler.start() 447 | 448 | 449 | def api_server(host, port): 450 | global APP_PORT 451 | APP_PORT = port 452 | app.run(host=host, port=port) 453 | 454 | 455 | if __name__ 
== "__main__": 456 | parser = argparse.ArgumentParser() 457 | parser.add_argument( 458 | "--host", type=str, default="0.0.0.0", help="host address for api server" 459 | ) 460 | parser.add_argument("--port", type=str, default=62333, help="port for api server") 461 | parser.add_argument( 462 | "--queue_timeout", 463 | type=int, 464 | default=300, 465 | help="timeout for queue waiting (seconds)", 466 | ) 467 | parser.add_argument( 468 | "--request_interval", 469 | type=int, 470 | default=1, 471 | help="interval for gpu status requesting (seconds)", 472 | ) 473 | parser.add_argument( 474 | "--status_queue_keep_time", 475 | type=int, 476 | default=48, 477 | help=( 478 | "hours for keeping the client status. " 479 | "set `-1` to keep all clients' status" 480 | ), 481 | ) 482 | parser.add_argument( 483 | "--token", 484 | type=str, 485 | default="", 486 | help="Authentication token for accessing the web interface. If empty, a token will be generated. Set to 'none' to disable authentication.", 487 | ) 488 | args = parser.parse_args() 489 | 490 | # Handle token authentication 491 | if args.token.lower() == "none": 492 | AUTH_TOKEN = None 493 | if os.path.exists(TOKEN_FILE): 494 | os.remove(TOKEN_FILE) # Remove token file if auth is disabled 495 | logger.info("Authentication disabled") 496 | else: 497 | # Check for token from command line, file, or generate a new one 498 | if args.token: 499 | AUTH_TOKEN = args.token 500 | save_token_to_file(AUTH_TOKEN) 501 | else: 502 | AUTH_TOKEN = load_token_from_file() 503 | if not AUTH_TOKEN: 504 | AUTH_TOKEN = generate_token() 505 | save_token_to_file(AUTH_TOKEN) 506 | 507 | logger.info(f"Authentication enabled with token: {AUTH_TOKEN}") 508 | logger.info(f"Token saved to {os.path.abspath(TOKEN_FILE)}") 509 | 510 | logger.info(f"Running at: {args.host}:{args.port}") 511 | logger.info(f"Current pid: {os.getpid()} > {PID_FILE}") 512 | with open(PID_FILE, "wt", encoding="utf-8") as fout: 513 | fout.write(f"{os.getpid()}") 514 | 515 | # daemon threads will end automaticly if the main thread ends 516 | # thread 1: check gpu and client info regularly 517 | check_worker = threading.Thread( 518 | name="check", 519 | target=regular_check, 520 | args=(args.request_interval, args.queue_timeout, args.status_queue_keep_time), 521 | daemon=True, 522 | ) 523 | 524 | # thread 2: main server api backend 525 | api_server_worker = threading.Thread( 526 | name="api", target=api_server, args=(args.host, args.port), daemon=True 527 | ) 528 | 529 | check_worker.start() 530 | logger.info("check worker started") 531 | api_server_worker.start() 532 | logger.info("api server started") 533 | 534 | while True: 535 | try: 536 | if not check_worker.is_alive(): 537 | logger.error("check worker is not alive, server quit") 538 | raise RuntimeError("check worker is not alive, server quit") 539 | if not api_server_worker.is_alive(): 540 | logger.error("api server worker is not alive, server quit") 541 | raise RuntimeError("api server worker is not alive, server quit") 542 | except RuntimeError: 543 | logger.error("runtime error, kill the server") 544 | break 545 | except KeyboardInterrupt: 546 | logger.error("keyboard interrupted, kill the server") 547 | break 548 | logger.error("bye") 549 | -------------------------------------------------------------------------------- /watchmen/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Watchmen 8 | 111 | 112 | 113 | 114 | 119 | 120 |
[index.html: interactive dashboard template; its markup and inline JavaScript were stripped in this dump. Recoverable fields: a Connection indicator, Host Name, and Query Time header, followed by the GPU status, working queue, and finished queue views shown in the README screenshots.]
-------------------------------------------------------------------------------- /watchmen/templates/old_index.html: --------------------------------------------------------------------------------
[old_index.html: legacy server-rendered template; its markup was stripped in this dump. Recoverable structure: a "Watchmen GPU Scheduler" heading and three Jinja-rendered sections. "GPU Status" shows {{ gpu_msg.hostname }} : {{ gpu_msg.query_time }} and a per-GPU table (Index, Name, Temp, Util, Memory, #Process). "Working Queue Status" lists clients (ID, Mode, GPU Scope, Request GPU Num, Queue Num, Last Request Time, Status). "Finished Queue Status" lists finished clients (ID, Mode, Request GPU Num, GPU Scope, Available GPUs, Queue Num, Last Request Time, Status). Status cells render OK / Waiting / Timeout badges.]
128 | 129 | 130 | -------------------------------------------------------------------------------- /watchmen/wait.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | import string 4 | import sys 5 | 6 | from watchmen import WatchClient 7 | 8 | 9 | def parse_args(in_args=None): 10 | arg_parser = argparse.ArgumentParser() 11 | arg_parser.add_argument("--task_name", type=str, required=True, help="Take Name") 12 | arg_parser.add_argument("--cuda", type=str, required=True, help="cuda to be waited") 13 | arg_parser.add_argument( 14 | "--req_gpu_num", 15 | type=int, 16 | required=False, 17 | default=1, 18 | help="request number of gpus", 19 | ) 20 | arg_parser.add_argument( 21 | "--wait", 22 | choices=["schedule", "queue", "none"], 23 | default="none", 24 | help="scheduling/queue wait", 25 | ) 26 | arg_parser.add_argument( 27 | "--token", 28 | type=str, 29 | default="", 30 | help="authentication token", 31 | ) 32 | arg_parser.add_argument( 33 | "--host", 34 | type=str, 35 | default="127.0.0.1", 36 | help="host", 37 | ) 38 | arg_parser.add_argument( 39 | "--port", 40 | type=int, 41 | default=62333, 42 | help="port", 43 | ) 44 | arg_info = arg_parser.parse_args(args=in_args) 45 | return arg_info 46 | 47 | 48 | if __name__ == "__main__": 49 | in_argv = parse_args() 50 | if in_argv.wait == "none": 51 | sys.exit(0) 52 | random_id = "-" + "".join(random.sample(string.ascii_letters + string.digits, 8)) 53 | exp_id = in_argv.task_name + random_id 54 | watch_client = WatchClient( 55 | id=exp_id, 56 | gpus=eval(f"[{in_argv.cuda}]"), 57 | server_host=in_argv.host, 58 | server_port=in_argv.port, 59 | req_gpu_num=in_argv.req_gpu_num, 60 | mode=in_argv.wait, 61 | timeout=60, 62 | token=in_argv.token, 63 | ) 64 | available_gpus = watch_client.wait() 65 | available_gpus = [str(x) for x in available_gpus] 66 | print(",".join(available_gpus), end="") 67 | -------------------------------------------------------------------------------- /working_queue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Spico197/watchmen/1075c5b3b32e65fb4692b4dfea3c768223a345cd/working_queue.png --------------------------------------------------------------------------------