├── .circleci └── config.yml ├── .deepsource.toml ├── .doctrees ├── .eggs │ ├── requests-2.19.1-py3.6.egg │ │ └── EGG-INFO │ │ │ └── DESCRIPTION.doctree │ └── urllib3-1.23-py3.6.egg │ │ └── EGG-INFO │ │ └── DESCRIPTION.doctree ├── environment.pickle └── index.doctree ├── .gitignore ├── LICENSE ├── README.md ├── benchmarks ├── README.md ├── mpi_two_machines.py ├── pytorch_two_machines.py ├── ray_ps.py ├── ray_two_machines.py ├── ray_two_machines_local.py ├── requirements.txt ├── summary.txt ├── tf_two_machines.py ├── tf_two_machines_local.py └── util.py ├── examples ├── deleteme.py ├── gpubox.py ├── gpubox_jupyter_notebook_config.py ├── gpubox_sample.ipynb ├── launch_16_instances.py ├── ray_example.py ├── requirements.txt ├── simple_job.py ├── simple_task.py ├── simple_tf.py ├── tf_adder.py └── tf_adder_tb.py ├── ncluster ├── __init__.py ├── _version.py ├── aws_backend.py ├── aws_util.py ├── local_backend.py ├── ncluster_cloud_setup.py ├── ncluster_cloud_wipe.py ├── ncluster_globals.py ├── old_backend.py ├── summary.txt ├── test.py └── util.py ├── requirements.txt ├── requirements_benchmarks.txt ├── requirements_test.txt ├── setup.cfg ├── setup.py ├── tests ├── integration_test.py ├── join_test.py ├── logdir_test.py ├── many_commands_test.py └── run_test.py └── tools ├── ncluster └── nsync /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | build: 8 | docker: 9 | # specify the version you desire here 10 | # use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers` 11 | - image: circleci/python:3.6.1 12 | 13 | # Specify service dependencies here if necessary 14 | # CircleCI maintains a library of pre-built images 15 | # documented at https://circleci.com/docs/2.0/circleci-images/ 16 | # - image: circleci/postgres:9.4 17 | 18 | working_directory: ~/repo 19 | 20 | steps: 21 | - checkout 22 | 23 | # Download and cache dependencies 24 | - restore_cache: 25 | keys: 26 | - v1-dependencies-{{ checksum "requirements.txt" }} 27 | # fallback to using the latest cache if no exact match is found 28 | - v1-dependencies- 29 | 30 | - run: 31 | name: install dependencies 32 | command: | 33 | python3 -m venv venv 34 | . venv/bin/activate 35 | pip install -r requirements.txt 36 | pip install -U ncluster 37 | 38 | - save_cache: 39 | paths: 40 | - ./venv 41 | key: v1-dependencies-{{ checksum "requirements.txt" }} 42 | 43 | # run tests! 44 | # this example uses Django's built-in test-runner 45 | # other common Python testing frameworks include pytest and nose 46 | # https://pytest.org 47 | # https://nose.readthedocs.io 48 | - run: 49 | name: run tests 50 | command: | 51 | . 
venv/bin/activate 52 | echo 'hello' 53 | python tests/integration_test.py 54 | 55 | - store_artifacts: 56 | path: test-reports 57 | destination: test-reports 58 | -------------------------------------------------------------------------------- /.deepsource.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | test_patterns = [ 4 | 5 | ] 6 | 7 | exclude_patterns = [ 8 | 9 | ] 10 | 11 | [[analyzers]] 12 | name = 'python' 13 | enabled = true 14 | runtime_version = '3.x.x' 15 | -------------------------------------------------------------------------------- /.doctrees/.eggs/requests-2.19.1-py3.6.egg/EGG-INFO/DESCRIPTION.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cybertronai/ncluster/9c2a7fb9677dba9afe48c94f35bde7c41e4cc75f/.doctrees/.eggs/requests-2.19.1-py3.6.egg/EGG-INFO/DESCRIPTION.doctree -------------------------------------------------------------------------------- /.doctrees/.eggs/urllib3-1.23-py3.6.egg/EGG-INFO/DESCRIPTION.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cybertronai/ncluster/9c2a7fb9677dba9afe48c94f35bde7c41e4cc75f/.doctrees/.eggs/urllib3-1.23-py3.6.egg/EGG-INFO/DESCRIPTION.doctree -------------------------------------------------------------------------------- /.doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cybertronai/ncluster/9c2a7fb9677dba9afe48c94f35bde7c41e4cc75f/.doctrees/environment.pickle -------------------------------------------------------------------------------- /.doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cybertronai/ncluster/9c2a7fb9677dba9afe48c94f35bde7c41e4cc75f/.doctrees/index.doctree -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /dist 2 | /build 3 | /.DS_Store 4 | /ncluster.egg-info 5 | /ncluster/__pycache__ 6 | /.eggs 7 | /ncluster/.idea 8 | /.idea 9 | __pycache__ 10 | /.pytest_cache 11 | *.py# 12 | /ncluster/.DS_Store 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2018] [Yaroslav Bulatov] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ncluster
2 | By Yaroslav Bulatov, Andrew Shaw, Ben Mann
3 | https://github.com/cybertronai/ncluster
4 | 
5 | Ncluster provides a Python API to do the following:
6 | - Allocate an AWS machine
7 | - Upload a file to the machine
8 | - Run a command on the machine
9 | - Download a file from the machine
10 | 
11 | For example:
12 | 
13 | ```
14 | import ncluster
15 | task = ncluster.make_task(instance_type='p2.xlarge')
16 | task.upload('myscript.py')
17 | task.run('python myscript.py > out')
18 | task.download('out')
19 | ```
20 | 
21 | Necessary AWS infrastructure is created on demand, using defaults optimized for fast prototyping. For example, your machines are preconfigured for passwordless SSH, can access each other over all interfaces, and have a persistent file system mounted under /ncluster. Commands are executed in a remote tmux session, so you can take over the environment at any time and continue from your terminal.
22 | 
23 | 
24 | ## Installation
25 | Install pip, tmux, Python 3.6 (see below), and [write down](https://docs.google.com/document/d/1Z8lCZVWXs7XORbiNmBAsBDtouV3KwrtH8-UL5M-zHus/edit) your AWS security keys, then
26 | 
27 | ```
28 | pip install -r https://raw.githubusercontent.com/yaroslavvb/ncluster/master/requirements.txt
29 | pip install -U ncluster # `pip install -e .` to install from a local clone
30 | export AWS_ACCESS_KEY_ID=AKIAIBATdf343
31 | export AWS_SECRET_ACCESS_KEY=z7yKEP/RhO3Olk343aiP
32 | export AWS_DEFAULT_REGION=us-east-1
33 | ```
34 | 
35 | 
36 | 
37 | ## Command-line tools
38 | 
39 | ```
40 | ncluster
41 | ncluster ls
42 | 
43 | # bring up machine t2.nano with default AMI
44 | ncluster launch --name testtest --instance_type t2.nano
45 | 
46 | # kill the machine
47 | ncluster kill testtest
48 | 
49 | # list machines
50 | ncluster ls
51 | ncluster ls
52 | 
53 | 
54 | ncluster ssh   # connects to latest instance
55 | ncluster ssh {substring}   # connects to latest instance containing {substring}
56 | ncluster ssh \'\'
57 | ncluster mosh
58 | ncluster kill {substring}   # terminates matching instances
59 | ncluster kill \'\'
60 | ncluster stop {substring}   # stops matching instances
61 | ncluster start {substring}   # starts matching stopped instances
62 | ncluster nano # starts a tiny instance
63 | ncluster keys # information on enabling SSH access for your team-members
64 | 
65 | ncluster ssh_ # like ssh but works on dumb terminals
66 | ncluster ls
67 | ncluster cat
68 | ncluster cmd "some command to run remotely on AWS"
69 | 
70 | ncluster efs # gives EFS info such as the mount command
71 | 
72 | nsync -m gpubox
73 | nsync -m gpubox -d transformer-xl
74 | 
75 | nsync -d {target directory} -m {machine name substring}
76 | 
77 | nsync -m gpubox # syncs . to ~ on gpubox
78 | nsync -d transformer-xl -m 4gpubox # syncs . to ~/transformer-xl on 4gpubox
79 | 
80 | ncluster hosts
81 | 
82 | 
83 | {substring} selects the most recently launched instances whose name contains the substring. Empty string is a valid substring. Skipping -d will sync to ~ on the remote machine.
Sync seems to be 1 way (from local -> remote) 84 | ``` 85 | 86 | ## Docs 87 | - Some out-of-date docs with more info [docs](https://docs.google.com/document/d/178ITRCAkboHoOEZFnz9XvOsc8lXik6Acz_DS_V1u8hY/edit?usp=sharing) 88 | 89 | ### Extra 90 | An example of installing pip/tmux/python 3.6 on MacOS 91 | 92 | 1. Download Anaconda distribution following https://conda.io/docs/user-guide/install/index.html 93 | 2. Install tmux through homebrew: https://brew.sh/, then `brew install tmux` 94 | 95 | Then 96 | 97 | ``` 98 | conda create -n new python=3.6 -y 99 | conda activate new 100 | ``` 101 | 102 | Extra Deps: 103 | ``` 104 | brew install fswatch 105 | ``` 106 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | Benchmarks 2 | 3 | ``` 4 | pip install -r https://raw.githubusercontent.com/diux-dev/ncluster/master/requirements.txt 5 | pip install ncluster 6 | python 7 | ``` 8 | 9 | 10 | # Debugging 11 | ``` 12 | export NCLUSTER_INSTANCE=c5.18xlarge 13 | export NCLUSTER_RUN_WITH_OUTPUT_ON_FAILURE=1 14 | ``` 15 | -------------------------------------------------------------------------------- /benchmarks/mpi_two_machines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Running locally 5 | 6 | 004/11 sent 100 MBs in 28.4 ms: 3519.33 MB/second 7 | 005/11 sent 100 MBs in 25.1 ms: 3988.50 MB/second 8 | 006/11 sent 100 MBs in 25.5 ms: 3918.33 MB/second 9 | 007/11 sent 100 MBs in 25.3 ms: 3958.61 MB/second 10 | 008/11 sent 100 MBs in 25.3 ms: 3954.15 MB/second 11 | 009/11 sent 100 MBs in 24.9 ms: 4009.78 MB/second 12 | 010/11 sent 100 MBs in 25.0 ms: 3992.75 MB/second 13 | min: 24.94, median: 25.52, mean: 29.53 14 | 15 | 16 | """ 17 | 18 | import argparse 19 | import json 20 | import os 21 | import numpy as np 22 | import tensorflow as tf 23 | import time 24 | 25 | import util 26 | 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--iters", default=11, type=int, 29 | help="Maximum number of additions") 30 | parser.add_argument("--size-mb", default=100, type=int, 31 | help="size of vector in MBs") 32 | parser.add_argument("--shards", default=1, type=int, 33 | help="how many ways to shard the variable") 34 | parser.add_argument('--image', 35 | default='Deep Learning AMI (Ubuntu) Version 22.0') 36 | parser.add_argument('--instance_type', type=str, default='') 37 | parser.add_argument('--name', 38 | default='mpi') 39 | 40 | # internal flags 41 | parser.add_argument('--role', default='launcher', type=str) 42 | args = parser.parse_args() 43 | 44 | 45 | def run_launcher(): 46 | import ncluster 47 | 48 | job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image, 49 | instance_type=args.instance_type) 50 | job.upload(__file__) 51 | job.upload('util.py') 52 | 53 | # kill python just for when tmux session reuse is on 54 | if not ncluster.running_locally(): 55 | job._run_raw('killall python', ignore_errors=True) 56 | 57 | if ncluster.get_backend() == 'aws': 58 | # on AWS probably running in conda DLAMI, switch into TF-enabled env 59 | # TODO(y) switch to PyTorch enabled 60 | job.run('source activate tensorflow_p36') 61 | 62 | 63 | 64 | # TODO(y): this should be private ip 65 | hosts = [task.ip for task in job.tasks] 66 | host_str = ','.join(hosts) 67 | os.system(f'/usr/local/mpi/bin/mpirun -np 2 --host {host_str} python {__file__} --role=worker') 68 | 
print(job.tasks[0].read('/tmp/out')) 69 | 70 | 71 | def run_worker(): 72 | param_size = 250 * 1000 * args.size_mb // args.shards # 1MB is 250k integers 73 | 74 | from mpi4py import MPI 75 | comm = MPI.COMM_WORLD 76 | rank = comm.Get_rank() 77 | 78 | if rank == 0: 79 | log = util.FileLogger('/tmp/out') 80 | # log = util.FileLogger('/dev/null', mirror=False) 81 | 82 | else: 83 | log = util.FileLogger('/dev/null', mirror=False) 84 | grads_array = [] 85 | 86 | time_list = [] 87 | dim = args.size_mb*250*1000 88 | dtype = np.float32 89 | data = np.ones(dim, dtype=dtype)*(rank+1) 90 | for i in range(args.iters): 91 | start_time = time.perf_counter() 92 | if rank == 0: 93 | comm.Send(data, dest=1, tag=13) 94 | else: 95 | data = np.empty(dim, dtype=dtype) 96 | comm.Recv(data, source=0, tag=13) 97 | 98 | end_time = time.perf_counter() 99 | 100 | elapsed_time_ms = (end_time - start_time) * 1000 101 | time_list.append(elapsed_time_ms) 102 | rate = args.size_mb / (elapsed_time_ms / 1000) 103 | log(f'{rank} {i:03d}/{args.iters:d} sent {args.size_mb:d} MBs in {elapsed_time_ms:.1f}' 104 | f' ms: {rate:.2f} MB/second') 105 | 106 | min = np.min(time_list) 107 | median = np.median(time_list) 108 | 109 | log(f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 110 | 111 | 112 | def main(): 113 | # run local benchmark in launcher and launch service 114 | if args.role == "launcher": 115 | run_launcher() 116 | elif args.role == "worker": 117 | run_worker() 118 | else: 119 | assert False, 'unknown role' 120 | 121 | 122 | if __name__ == '__main__': 123 | main() 124 | -------------------------------------------------------------------------------- /benchmarks/pytorch_two_machines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Run locally: 4 | # ./pytorch_p2p.py 5 | # 000/10 added 100 MBs in 35.0 ms: 2854.88 MB/second 6 | # 001/10 added 100 MBs in 25.1 ms: 3979.37 MB/second 7 | # 002/10 added 100 MBs in 25.4 ms: 3935.73 MB/second 8 | # 003/10 added 100 MBs in 24.7 ms: 4040.93 MB/second 9 | # 004/10 added 100 MBs in 24.4 ms: 4097.57 MB/second 10 | # min: 21.58, median: 24.97, mean: 25.61 11 | 12 | # To run on AWS: 13 | # export NCLUSTER_IMAGE='Deep Learning AMI (Ubuntu) Version 15.0' 14 | # export NCLUSTER_INSTANCE=c5.18xlarge 15 | # python pytorch_p2p.py --aws 16 | # 990/1000 added 100 MBs in 83.7 ms: 1194.35 MB/second 17 | # 991/1000 added 100 MBs in 83.4 ms: 1198.78 MB/second 18 | # 992/1000 added 100 MBs in 83.4 ms: 1198.73 MB/second 19 | # 993/1000 added 100 MBs in 83.3 ms: 1201.20 MB/second 20 | # 994/1000 added 100 MBs in 83.1 ms: 1203.84 MB/second 21 | # 995/1000 added 100 MBs in 83.1 ms: 1203.04 MB/second 22 | # 996/1000 added 100 MBs in 83.5 ms: 1197.38 MB/second 23 | # 997/1000 added 100 MBs in 82.4 ms: 1213.99 MB/second 24 | # 998/1000 added 100 MBs in 84.2 ms: 1187.69 MB/second 25 | # 999/1000 added 100 MBs in 83.0 ms: 1204.13 MB/second 26 | # min: 80.52, median: 83.25, mean: 83.29 27 | 28 | import os 29 | import sys 30 | import time 31 | import argparse 32 | import util 33 | 34 | parser = argparse.ArgumentParser(description='launch') 35 | 36 | # launcher flags 37 | parser.add_argument('--name', type=str, default='pytorch_two_machines', 38 | help="name of the current run") 39 | parser.add_argument('--size-mb', type=int, default=100, 40 | help='size of data to send') 41 | parser.add_argument('--iters', type=int, default=10, 42 | help='how many iterations') 43 | parser.add_argument("--aws", 
action="store_true", help="enable to run on AWS") 44 | parser.add_argument('--image', 45 | default='Deep Learning AMI (Ubuntu) Version 15.0') 46 | 47 | 48 | # mpi flags 49 | parser.add_argument('--role', type=str, default='launcher', 50 | help='internal flag, launcher or worker') 51 | parser.add_argument('--rank', type=int, default=0, 52 | help='mpi rank') 53 | parser.add_argument('--size', type=int, default=0, 54 | help='size of mpi world') 55 | parser.add_argument('--master-addr', type=str, default='127.0.0.1', 56 | help='address of master node') 57 | parser.add_argument('--master-port', type=int, default=6006, 58 | help='port of master node') 59 | args = parser.parse_args() 60 | 61 | def worker(): 62 | """ Initialize the distributed environment. """ 63 | 64 | import torch 65 | import torch.distributed as dist 66 | from torch.multiprocessing import Process 67 | import numpy as np 68 | 69 | print("Initializing distributed pytorch") 70 | os.environ['MASTER_ADDR'] = str(args.master_addr) 71 | os.environ['MASTER_PORT'] = str(args.master_port) 72 | # Use TCP backend. Gloo needs nightly, where it currently fails with 73 | # dist.init_process_group('gloo', rank=args.rank, 74 | # AttributeError: module 'torch.distributed' has no attribute 'init_process_group' 75 | dist.init_process_group('tcp', rank=args.rank, 76 | world_size=args.size) 77 | 78 | tensor = torch.ones(args.size_mb*250*1000)*(args.rank+1) 79 | time_list = [] 80 | outfile = 'out' if args.rank == 0 else '/dev/null' 81 | log = util.FileLogger(outfile) 82 | for i in range(args.iters): 83 | # print('before: rank ', args.rank, ' has data ', tensor[0]) 84 | 85 | start_time = time.perf_counter() 86 | if args.rank == 0: 87 | dist.send(tensor=tensor, dst=1) 88 | else: 89 | dist.recv(tensor=tensor, src=0) 90 | 91 | elapsed_time_ms = (time.perf_counter() - start_time)*1000 92 | time_list.append(elapsed_time_ms) 93 | # print('after: rank ', args.rank, ' has data ', tensor[0]) 94 | rate = args.size_mb/(elapsed_time_ms/1000) 95 | 96 | log('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb, elapsed_time_ms, rate)) 97 | 98 | min = np.min(time_list) 99 | median = np.median(time_list) 100 | log(f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 101 | 102 | 103 | def launcher(): 104 | import ncluster 105 | 106 | if args.aws: 107 | ncluster.set_backend('aws') 108 | 109 | job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image) 110 | job.upload(__file__) 111 | job.upload('util.py') 112 | 113 | if args.aws: 114 | job.run('source activate pytorch_p36') 115 | else: 116 | job.run('source deactivate') 117 | job.run('source activate ncluster-test3') 118 | 119 | script_name = os.path.basename(__file__) 120 | common_args = f'--size=2 --master-addr={job.tasks[0].ip} --iters={args.iters} --size-mb={args.size_mb}' 121 | job.tasks[0].run(f'python {script_name} --role=worker --rank=0 '+common_args, 122 | non_blocking=True) 123 | job.tasks[1].run(f'python {script_name} --role=worker --rank=1 '+common_args, 124 | non_blocking=True) 125 | 126 | job.tasks[0].join() 127 | print(job.tasks[0].read('out')) 128 | 129 | 130 | def main(): 131 | if args.role == "launcher": 132 | launcher() 133 | elif args.role == "worker": 134 | worker() 135 | else: 136 | assert False, "Unknown role "+FLAGS.role 137 | 138 | 139 | if __name__ == "__main__": 140 | main() 141 | -------------------------------------------------------------------------------- /benchmarks/ray_ps.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Ray parameter server benchmark 4 | # 5 | # python ray_ps.py --aws --num-ps=1 --num-workers=1 --size-mb=100 --iters=100 6 | 7 | # # 1 worker, 1 ps 8 | # min: 61.61, median: 63.77, mean: 69.20 9 | 10 | # # 1 worker, 2 ps 11 | # python ray_ps.py --aws --num-ps=2 --num-workers=1 --size-mb=100 --iters=100 12 | # min: 49.45, median: 50.91, mean: 58.92 13 | 14 | # # 1 worker, 4 ps 15 | # python ray_ps.py --aws --num-ps=4 --num-workers=1 --size-mb=100 --iters=100 16 | # min: 47.98, median: 50.71, mean: 59.05 17 | 18 | # # 4 worker, 4 ps 19 | # python ray_ps.py --aws --num-ps=4 --num-workers=4 --size-mb=100 --iters=100 20 | # 098/100 sent 400 MBs in 238.5 ms: 419.28 MB/second 21 | # 099/100 sent 400 MBs in 242.0 ms: 413.22 MB/second 22 | # min: 219.90, median: 241.51, mean: 245.95 23 | # (54ms per worker since 4x more work done) 24 | 25 | # # 1 worker, 4 ps, larger arrays 26 | # python ray_ps.py --aws --num-ps=4 --num-workers=1 --size-mb=800 --iters=100 27 | # min: 358.35, median: 544.59, mean: 513.47 28 | # 29 | # Bottom line, 50-60ms to send 100MB regardless of sharding/workers 30 | 31 | import argparse 32 | import os 33 | import socket 34 | import subprocess 35 | import time 36 | 37 | import numpy as np 38 | import ray 39 | 40 | import util 41 | 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("--role", default='launcher', type=str, 44 | help="launcher/driver") 45 | parser.add_argument('--image', 46 | default='Deep Learning AMI (Ubuntu) Version 15.0') 47 | parser.add_argument("--size-mb", default=10, type=int, 48 | help='how much data to send at each iteration') 49 | parser.add_argument("--num-workers", default=2, type=int) 50 | parser.add_argument("--num-ps", default=2, type=int) 51 | 52 | parser.add_argument("--iters", default=11, type=int) 53 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 54 | parser.add_argument("--xray", default=1, type=int, 55 | help="whether to use XRay backend") 56 | parser.add_argument('--nightly', default=1, type=int, 57 | help='whether to use nightly version') 58 | parser.add_argument('--name', default='ray_ps', type=str, 59 | help='name of the run') 60 | parser.add_argument("--ip", default='', type=str, 61 | help="internal flag, used to point worker to head node") 62 | args = parser.parse_args() 63 | 64 | dim = args.size_mb * 250 * 1000 // args.num_ps 65 | 66 | 67 | @ray.remote(resources={"worker": 1}) 68 | class Worker(object): 69 | def __init__(self): 70 | self.gradients = np.ones(dim, dtype=np.float32) 71 | 72 | @ray.method(num_return_vals=args.num_ps) 73 | def compute_gradients(self): 74 | if args.num_ps == 1: 75 | return self.gradients 76 | return [self.gradients]*args.num_ps 77 | 78 | def ip(self): 79 | return ray.services.get_node_ip_address() 80 | 81 | 82 | @ray.remote(resources={"worker": 1}) 83 | class ParameterServer(object): 84 | def __init__(self): 85 | self.params = np.zeros(dim, dtype=np.float32) 86 | 87 | def receive(self, *grad_list): 88 | for grad in grad_list: 89 | self.params = grad # use = just to get network overhead 90 | return self.params 91 | 92 | def get_weights(self): 93 | return self.params 94 | 95 | def ip(self): 96 | return ray.services.get_node_ip_address() 97 | 98 | 99 | 100 | def run_launcher(): 101 | import ncluster 102 | 103 | if args.aws: 104 | ncluster.set_backend('aws') 105 | 106 | if args.nightly: 107 | # running locally MacOS 108 | if 'Darwin' in 
util.ossystem('uname') and not args.aws: 109 | install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl' 110 | else: 111 | install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl' 112 | else: 113 | install_script = 'pip install ray' 114 | 115 | job = ncluster.make_job(name=args.name, 116 | install_script=install_script, 117 | image_name=args.image, 118 | num_tasks=args.num_workers+args.num_ps) 119 | if not ncluster.running_locally(): 120 | job._run_raw('killall python', ignore_errors=True) 121 | 122 | job.upload(__file__) 123 | job.upload('util.py') 124 | if args.xray: 125 | job.run('export RAY_USE_XRAY=1') 126 | job.run('ray stop') 127 | 128 | head = job.tasks[0] 129 | 130 | # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources 131 | worker_resource = """--resources='{"worker": 1}'""" 132 | head.run(f"ray start --head {worker_resource} --redis-port=6379") 133 | 134 | for task in job.tasks[1:]: 135 | task.run(f"ray start --redis-address={head.ip}:6379 {worker_resource}") 136 | 137 | head.run(f'python {__file__} --role=driver --ip={head.ip}:6379 --size-mb={args.size_mb} --iters={args.iters} --num-workers={args.num_workers} --num-ps={args.num_ps}') 138 | 139 | print(head.read('out')) 140 | 141 | 142 | def transpose(list_of_lists): 143 | return list(map(list, zip(*list_of_lists))) 144 | 145 | 146 | def run_driver(): 147 | ray.init(redis_address=args.ip) 148 | 149 | worker_actors = [Worker.remote() for _ in range(args.num_workers)] 150 | ps_actors = [ParameterServer.remote() for _ in range(args.num_ps)] 151 | 152 | log = util.FileLogger('out') 153 | 154 | time_list = [] 155 | for i in range(args.iters): 156 | start_time = time.perf_counter() 157 | grads_list = [] 158 | for actor in worker_actors: 159 | result = actor.compute_gradients.remote() 160 | if args.num_ps == 1: 161 | grads_list.append([result]) 162 | else: 163 | grads_list.append(result) 164 | 165 | updates = [] 166 | for ps, shards in zip(ps_actors, transpose(grads_list)): 167 | updates.append(ps.receive.remote(*shards)) 168 | 169 | ray.wait(updates, num_returns=args.num_ps) 170 | 171 | elapsed_time_ms = (time.perf_counter() - start_time)*1000 172 | time_list.append(elapsed_time_ms) 173 | rate = args.size_mb / (elapsed_time_ms/1000) 174 | log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb*args.num_workers, elapsed_time_ms, rate)) 175 | 176 | min = np.min(time_list) 177 | median = np.median(time_list) 178 | log(f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 179 | 180 | 181 | def main(): 182 | if args.role == 'launcher': 183 | run_launcher() 184 | elif args.role == 'driver': 185 | run_driver() 186 | else: 187 | assert False, f"Unknown role {args.role}, must be laucher/driver" 188 | 189 | 190 | if __name__ == '__main__': 191 | main() 192 | -------------------------------------------------------------------------------- /benchmarks/ray_two_machines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Example of two process Ray program, worker sends values to parameter 4 | # server on a different machine 5 | # 6 | # Run locally: 7 | # ./ray_two_machines.py 8 | # 9 | # Run on AWS: 10 | # ./ray_two_machines.py --aws 11 | 12 | 13 | # Example timings 14 | # c5.18xlarge over network: over network: 63.0 ms: 1586.76 MB/second 15 | # c5.9xlarge over 
network: 399/400 added 100 MBs in 85.5 ms: 1170.26 MB/second 16 | # c5.18xlarge locally: 86 ms, 1218 MB/seconds (9.7 Gbps) 17 | # macbook pro locally: 978.9 ms, 102.15 MB/second 18 | 19 | # c5.18xlarge 20 | # 004/11 sent 100 MBs in 69.4 ms: 1440.31 MB/second 21 | # 005/11 sent 100 MBs in 68.1 ms: 1468.95 MB/second 22 | # 006/11 sent 100 MBs in 70.4 ms: 1421.40 MB/second 23 | # 007/11 sent 100 MBs in 69.5 ms: 1438.62 MB/second 24 | # 008/11 sent 100 MBs in 66.4 ms: 1506.90 MB/second 25 | # 009/11 sent 100 MBs in 76.5 ms: 1306.92 MB/second 26 | # 010/11 sent 100 MBs in 66.8 ms: 1497.64 MB/second 27 | # min: 66.36, median: 69.43, mean: 70.55 28 | 29 | # Another run 30 | # 989/1000 sent 100 MBs in 54.6 ms: 1831.07 MB/second 31 | # 990/1000 sent 100 MBs in 54.4 ms: 1837.20 MB/second 32 | # 991/1000 sent 100 MBs in 54.8 ms: 1824.91 MB/second 33 | # 992/1000 sent 100 MBs in 53.4 ms: 1874.39 MB/second 34 | # 993/1000 sent 100 MBs in 53.1 ms: 1881.77 MB/second 35 | # 994/1000 sent 100 MBs in 52.7 ms: 1897.76 MB/second 36 | # 995/1000 sent 100 MBs in 55.4 ms: 1805.42 MB/second 37 | # 996/1000 sent 100 MBs in 53.4 ms: 1872.93 MB/second 38 | # 997/1000 sent 100 MBs in 52.7 ms: 1896.65 MB/second 39 | # 998/1000 sent 100 MBs in 54.0 ms: 1851.14 MB/second 40 | # 999/1000 sent 100 MBs in 53.6 ms: 1864.93 MB/second 41 | # min: 51.11, median: 55.45, mean: 60.74 42 | 43 | 44 | # Bottom line: 30ms locally, 60ms over network 45 | 46 | import argparse 47 | import os 48 | import socket 49 | import subprocess 50 | import time 51 | 52 | import numpy as np 53 | import ray 54 | 55 | import util 56 | 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument("--role", default='launcher', type=str, 59 | help="launcher/driver") 60 | parser.add_argument('--image', 61 | default='Deep Learning AMI (Ubuntu) Version 15.0') 62 | parser.add_argument("--size-mb", default=100, type=int, 63 | help='how much data to send at each iteration') 64 | parser.add_argument("--iters", default=11, type=int) 65 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 66 | parser.add_argument("--xray", default=1, type=int, 67 | help="whether to use XRay backend") 68 | parser.add_argument('--nightly', default=1, type=int, 69 | help='whether to use nightly version') 70 | parser.add_argument('--name', default='ray_two_machines', type=str, 71 | help='name of the run') 72 | parser.add_argument("--ip", default='', type=str, 73 | help="internal flag, used to point worker to head node") 74 | args = parser.parse_args() 75 | 76 | dim = args.size_mb * 250 * 1000 77 | 78 | 79 | @ray.remote(resources={"worker": 1}) 80 | class Worker(object): 81 | def __init__(self): 82 | self.gradients = np.ones(dim, dtype=np.float32) 83 | 84 | def compute_gradients(self): 85 | return self.gradients 86 | 87 | def ip(self): 88 | return ray.services.get_node_ip_address() 89 | 90 | 91 | @ray.remote(resources={"ps": 1}) 92 | class ParameterServer(object): 93 | def __init__(self): 94 | self.params = np.zeros(dim, dtype=np.float32) 95 | 96 | def receive(self, grad): 97 | self.params = grad # use = just to get network overhead 98 | return self.params 99 | 100 | def get_weights(self): 101 | return self.params 102 | 103 | def ip(self): 104 | return ray.services.get_node_ip_address() 105 | 106 | 107 | 108 | def run_launcher(): 109 | import ncluster 110 | 111 | if args.aws: 112 | ncluster.set_backend('aws') 113 | 114 | if args.nightly: 115 | # running locally MacOS 116 | print(f"asdfasdf {util.ossystem('uname')}") 117 | if 'Darwin' in 
util.ossystem('uname') and not args.aws: 118 | install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl' 119 | print(f"asdfasdf got install script {install_script}") 120 | else: 121 | install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl' 122 | else: 123 | install_script = 'pip install ray' 124 | 125 | job = ncluster.make_job(name=args.name, 126 | install_script=install_script, 127 | image_name=args.image, 128 | num_tasks=2) 129 | ps, worker = job.tasks 130 | if not ncluster.running_locally(): 131 | ps._run_raw('killall python', ignore_errors=True) 132 | worker._run_raw('killall python', ignore_errors=True) 133 | 134 | job.upload(__file__) 135 | job.upload('util.py') 136 | if args.xray: 137 | job.run('export RAY_USE_XRAY=1') 138 | job.run('ray stop') 139 | 140 | # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources 141 | ps_resource = """--resources='{"ps": 1}'""" 142 | worker_resource = """--resources='{"worker": 1}'""" 143 | 144 | ps.run(f"ray start --head {ps_resource} --redis-port=6379") 145 | worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}") 146 | worker.run( 147 | f'./{__file__} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}') 148 | print(worker.read('out')) 149 | 150 | 151 | def run_driver(): 152 | ray.init(redis_address=args.ip) 153 | 154 | worker = Worker.remote() 155 | ps = ParameterServer.remote() 156 | log = util.FileLogger('out') 157 | log(f"Worker ip {ray.get(worker.ip.remote())}") 158 | log(f"PS ip {ray.get(ps.ip.remote())}") 159 | log(f"Driver ip {socket.gethostbyname(socket.gethostname())}") 160 | 161 | time_list = [] 162 | for i in range(args.iters): 163 | start_time = time.perf_counter() 164 | grads = worker.compute_gradients.remote() 165 | result = ps.receive.remote(grads) 166 | ray.wait([result]) 167 | elapsed_time_ms = (time.perf_counter() - start_time)*1000 168 | time_list.append(elapsed_time_ms) 169 | rate = args.size_mb / (elapsed_time_ms/1000) 170 | log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb, elapsed_time_ms, rate)) 171 | 172 | min = np.min(time_list) 173 | median = np.median(time_list) 174 | log(f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 175 | 176 | 177 | def main(): 178 | if args.role == 'launcher': 179 | run_launcher() 180 | elif args.role == 'driver': 181 | run_driver() 182 | else: 183 | assert False, f"Unknown role {args.role}, must be laucher/driver" 184 | 185 | 186 | if __name__ == '__main__': 187 | main() 188 | -------------------------------------------------------------------------------- /benchmarks/ray_two_machines_local.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Runs two machine benchmark locally on AWS machine 4 | # 5 | # Example timings 6 | # macbook: added 10 MBs in 14.1 ms: 707.68 MB/second 7 | # c5.18xlarge: added 10 MBs in 4.4 ms: 2298.82 MB/second 8 | # 091/100 added 100 MBs in 30.8 ms: 3246.44 MB/second 9 | 10 | # Bottom line: can do 3.2 GB/second running locally, 800 11 | import argparse 12 | import os 13 | import socket 14 | import subprocess 15 | import time 16 | 17 | import numpy as np 18 | import ray 19 | 20 | import util 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--image', 24 | default='Deep Learning AMI (Ubuntu) Version 15.0') 25 | 
parser.add_argument("--size-mb", default=100, type=int, 26 | help='how much data to send at each iteration') 27 | parser.add_argument("--iters", default=11, type=int) 28 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 29 | parser.add_argument("--xray", default=1, type=int, 30 | help="whether to use XRay backend") 31 | parser.add_argument('--nightly', default=1, type=int, 32 | help='whether to use nightly version') 33 | parser.add_argument('--name', default='ray_two_machines', type=str, 34 | help='name of the run') 35 | 36 | parser.add_argument("--ip", default='', type=str, 37 | help="internal flag, used to point worker to head node") 38 | parser.add_argument("--role", default='launcher', type=str, 39 | help="interanl flag, launcher/driver") 40 | args = parser.parse_args() 41 | 42 | dim = args.size_mb * 250 * 1000 43 | 44 | 45 | @ray.remote(resources={"worker": 1}) 46 | class Worker(object): 47 | def __init__(self): 48 | self.gradients = np.ones(dim, dtype=np.float32) 49 | 50 | def compute_gradients(self): 51 | return self.gradients 52 | 53 | def ip(self): 54 | return ray.services.get_node_ip_address() 55 | 56 | 57 | @ray.remote(resources={"ps": 1}) 58 | class ParameterServer(object): 59 | def __init__(self): 60 | self.params = np.zeros(dim, dtype=np.float32) 61 | 62 | def assign_add(self, grad): 63 | self.params = grad # use = just to get network overhead 64 | return self.params 65 | 66 | def get_weights(self): 67 | return self.params 68 | 69 | def ip(self): 70 | return ray.services.get_node_ip_address() 71 | 72 | 73 | 74 | def run_launcher(): 75 | import ncluster 76 | 77 | if args.aws: 78 | ncluster.set_backend('aws') 79 | 80 | if args.nightly: 81 | # running locally MacOS 82 | if 'Darwin' in util.ossystem('uname') and not args.aws: 83 | install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl' 84 | else: 85 | install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl' 86 | else: 87 | install_script = 'pip install ray' 88 | 89 | worker = ncluster.make_task(name=args.name, 90 | install_script=install_script, 91 | image_name=args.image) 92 | if not ncluster.running_locally(): 93 | worker._run_raw('killall python', ignore_errors=True) 94 | worker.upload(__file__) 95 | worker.upload('util.py') 96 | if args.xray: 97 | worker.run('export RAY_USE_XRAY=1') 98 | worker.run('ray stop') 99 | 100 | resources = """--resources='{"ps": 1, "worker": 1}'""" 101 | worker.run(f"ray start --head {resources} --redis-port=6379") 102 | # worker.run(f"ray start --redis-address={worker.ip}:6379 {resources}") 103 | worker.run( 104 | f'./{__file__} --role=driver --ip={worker.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}') 105 | print(worker.read('out')) 106 | 107 | 108 | def run_driver(): 109 | ray.init(redis_address=args.ip) 110 | 111 | worker = Worker.remote() 112 | ps = ParameterServer.remote() 113 | log = util.FileLogger('out') 114 | log(f"Worker ip {ray.get(worker.ip.remote())}") 115 | log(f"Driver ip {socket.gethostbyname(socket.gethostname())}") 116 | 117 | time_list = [] 118 | for i in range(args.iters): 119 | start_time = time.perf_counter() 120 | grads = worker.compute_gradients.remote() 121 | result = ps.assign_add.remote(grads) 122 | result = ray.get(result)[0] 123 | elapsed_time_ms = (time.perf_counter() - start_time)*1000 124 | time_list.append(elapsed_time_ms) 125 | rate = args.size_mb / (elapsed_time_ms/1000) 126 | 
log('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb, elapsed_time_ms, rate)) 127 | 128 | min = np.min(time_list) 129 | median = np.median(time_list) 130 | log(f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 131 | 132 | 133 | def main(): 134 | if args.role == 'launcher': 135 | run_launcher() 136 | elif args.role == 'driver': 137 | run_driver() 138 | else: 139 | assert False, f"Unknown role {args.role}, must be laucher/driver" 140 | 141 | 142 | if __name__ == '__main__': 143 | main() 144 | -------------------------------------------------------------------------------- /benchmarks/requirements.txt: -------------------------------------------------------------------------------- 1 | ray 2 | torch 3 | tensorflow 4 | -------------------------------------------------------------------------------- /benchmarks/summary.txt: -------------------------------------------------------------------------------- 1 | tf_two_machines -- 500 on t3, 910 on c3 2 | 3 | Ray can do 4 | 30ms on local transfers, 60ms on AWS c5.18xlarge 5 | Using multiple ps shards, can do 48ms on AWS 6 | 7 | 8 | 40ms on unoptimized PyTorch clone 9 | 2.7ms for optimized memcpy on skylake: 300 Gbps (37 GB/second, close to memory bandwidth) -- https://www.google.com/url?q=https://www.anandtech.com/show/11544/intel-skylake-ep-vs-amd-epyc-7000-cpu-battle-of-the-decade/12&source=gmail&ust=1537921524487000&usg=AFQjCNGUrAScjR_rAihauUr-nj5TMg-VKQ 10 | 11 | 12 | PyTorch backend can do 20 Gbps per thread on 13 | -------------------------------------------------------------------------------- /benchmarks/tf_two_machines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | """ 5 | TensorFlow distributed benchmark. Create sender/receiver tasks and add arrays from sender tasks to variable on receiver. 
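(How it works: the sender pins "grads" variables to /job:chief and the receiver pins matching "params" variables to /job:receiver; a grouped assign op copies the data across the network on each step, and the wall-clock time of each sess.run gives the MB/second figures quoted below. The timings shown are sample runs, not guarantees.)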
6 | 7 | To run locally: 8 | ./tf_two_machines.py 9 | Should see something like this 10 | 11 | ``` 12 | 005/11 added 100 MBs in 78.9 ms: 1266.98 MB/second 13 | 006/11 added 100 MBs in 78.1 ms: 1280.07 MB/second 14 | 007/11 added 100 MBs in 78.1 ms: 1280.56 MB/second 15 | 008/11 added 100 MBs in 81.8 ms: 1222.76 MB/second 16 | 009/11 added 100 MBs in 79.5 ms: 1258.54 MB/second 17 | 010/11 added 100 MBs in 76.6 ms: 1305.64 MB/second 18 | min: 76.59, median: 78.80, mean: 88.34 19 | ``` 20 | 21 | To interact with task 1 (the driver), do "tmux a -t 1" 22 | 23 | To run on AWS 24 | aws configure # or set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY/AWS_DEFAULT_REGION 25 | ./tf_two_machines.py --aws 26 | 27 | Should see something like this with t3.large instances 28 | ``` 29 | 089/100 added 100 MBs in 253.8 ms: 504.27 MB/second 30 | 090/100 added 100 MBs in 252.6 ms: 506.63 MB/second 31 | 091/100 added 100 MBs in 255.0 ms: 501.92 MB/second 32 | ``` 33 | 34 | Running c5.18xlarge machines with more iterations 35 | 007/11 sent 100 MBs in 135.4 ms: 738.47 MB/second 36 | 008/11 sent 100 MBs in 133.0 ms: 752.04 MB/second 37 | 009/11 sent 100 MBs in 133.8 ms: 747.48 MB/second 38 | 010/11 sent 100 MBs in 136.3 ms: 733.77 MB/second 39 | min: 132.97, median: 134.98, mean: 137.27 40 | 41 | 42 | Can use more shards 43 | ./tf_two_machines.py --aws --shards=8 --iters=1000 44 | 994/1000 sent 100 MBs in 87.0 ms: 1149.50 MB/second 45 | 995/1000 sent 100 MBs in 87.0 ms: 1149.21 MB/second 46 | 996/1000 sent 100 MBs in 86.8 ms: 1152.11 MB/second 47 | 997/1000 sent 100 MBs in 89.8 ms: 1113.89 MB/second 48 | 998/1000 sent 100 MBs in 87.9 ms: 1137.37 MB/second 49 | 999/1000 sent 100 MBs in 88.0 ms: 1135.80 MB/second 50 | min: 86.12, median: 88.48, mean: 89.51 51 | 52 | 53 | To connect and interact with the job look for SSH instructions like this 54 | To connect to 0.tf_two_machines 55 | ssh -i /Users/yaroslav/.ncluster/ncluster2-yaroslav-316880547378-us-east-1.pem -o StrictHostKeyChecking=no ubuntu@18.234.30.222 56 | 57 | ssh into the instance following these instructions, then run "tmux a" 58 | 59 | 60 | """ 61 | 62 | import argparse 63 | import json 64 | import os 65 | import numpy as np 66 | import tensorflow as tf 67 | import time 68 | 69 | import util 70 | 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 73 | parser.add_argument("--iters", default=11, type=int, 74 | help="Maximum number of additions") 75 | parser.add_argument("--size-mb", default=100, type=int, 76 | help="size of vector in MBs") 77 | parser.add_argument("--shards", default=1, type=int, 78 | help="how many ways to shard the variable") 79 | parser.add_argument('--image', 80 | default='Deep Learning AMI (Ubuntu) Version 15.0') 81 | parser.add_argument('--name', 82 | default='tf_two_machines') 83 | 84 | # internal flags 85 | parser.add_argument('--role', default='launcher', type=str) 86 | parser.add_argument("--sender-ip", default='127.0.0.1') 87 | parser.add_argument("--receiver-ip", default='127.0.0.1') 88 | args = parser.parse_args() 89 | 90 | cluster_spec = {'chief': [args.sender_ip + ':32300'], 91 | 'receiver': [args.receiver_ip + ':32301']} 92 | 93 | 94 | def _launch_server(role): 95 | os.environ['TF_CONFIG'] = json.dumps( 96 | {'cluster': cluster_spec, 97 | 'task': {'type': role, 'index': 0}}) 98 | config = tf.estimator.RunConfig() 99 | return tf.train.Server(config.cluster_spec, 100 | job_name=config.task_type, 101 | task_index=config.task_id) 102 | 103 | 104 | def 
run_launcher(): 105 | import ncluster 106 | if args.aws: 107 | ncluster.set_backend('aws') 108 | 109 | job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image) 110 | job.upload(__file__) 111 | job.upload('util.py') 112 | 113 | sender, receiver = job.tasks 114 | # kill python just for when tmux session reuse is on 115 | if not ncluster.running_locally(): 116 | sender._run_raw('killall python', ignore_errors=True) 117 | receiver._run_raw('killall python', ignore_errors=True) 118 | 119 | if ncluster.get_backend() == 'aws': 120 | # on AWS probably running in conda DLAMI, switch into TF-enabled env 121 | job.run('source activate tensorflow_p36') 122 | 123 | ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}' 124 | receiver.run(f'python {__file__} --role=receiver {ip_config}', 125 | non_blocking=True) 126 | sender.run( 127 | f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}') 128 | print(sender.read('out')) 129 | 130 | 131 | def run_receiver(): 132 | server = _launch_server('receiver') 133 | time.sleep(365 * 24 * 3600) 134 | del server 135 | 136 | 137 | def run_sender(): 138 | param_size = 250 * 1000 * args.size_mb // args.shards # 1MB is 250k integers 139 | log = util.FileLogger('out') 140 | grads_array = [] 141 | with tf.device('/job:chief/task:0'): 142 | # grads = tf.fill([param_size], 1.) 143 | for i in range(args.shards): 144 | grads = tf.Variable(tf.ones([param_size])) 145 | grads_array.append(grads) 146 | 147 | params_array = [] 148 | add_op_array = [] 149 | with tf.device('/job:receiver/task:0'): 150 | for i in range(args.shards): 151 | params = tf.Variable(tf.ones([param_size])) 152 | add_op = params.assign(grads_array[i]).op 153 | params_array.append(params) 154 | add_op_array.append(add_op) 155 | add_op = tf.group(*add_op_array) 156 | 157 | server = _launch_server('chief') 158 | sess = tf.Session(server.target) 159 | sess.run(tf.global_variables_initializer()) 160 | # except Exception as e: 161 | # # sometimes .run fails with .UnavailableError: OS Error 162 | # log(f"initialization failed with {e}, retrying in 1 second") 163 | # time.sleep(1) 164 | 165 | time_list = [] 166 | for i in range(args.iters): 167 | start_time = time.perf_counter() 168 | sess.run(add_op) 169 | elapsed_time_ms = (time.perf_counter() - start_time) * 1000 170 | time_list.append(elapsed_time_ms) 171 | rate = args.size_mb / (elapsed_time_ms / 1000) 172 | log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % ( 173 | i, args.iters, args.size_mb, elapsed_time_ms, rate)) 174 | 175 | min = np.min(time_list) 176 | median = np.median(time_list) 177 | 178 | log( 179 | f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 180 | 181 | 182 | def main(): 183 | # run local benchmark in launcher and launch service 184 | if args.role == "launcher": 185 | run_launcher() 186 | elif args.role == "sender": 187 | run_sender() 188 | elif args.role == "receiver": 189 | run_receiver() 190 | else: 191 | assert False, 'unknown role' 192 | 193 | 194 | if __name__ == '__main__': 195 | main() 196 | -------------------------------------------------------------------------------- /benchmarks/tf_two_machines_local.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Runs distributed benchmark on a single machine remotely 4 | 5 | Adding 100MB buffers 6 | 7 | # 1 shard: 88ms 8 | # 4 shards: 56ms 9 | # 8 shards: 51ms 10 | # 16 shards: 55ms 11 | 12 | # increase 
size 8x 13 | python tf_two_machines_local.py --shards=8 --iters=100 --size-mb=800 --aws 14 | # 416ms 15 | 16 | Bottom line: 1.6-1.9 GB/second when running locally 17 | Going 1->4 shards saves 30%, 4->8 shards another 5% 18 | 19 | i3.metal 30% slower than c5.18xlarge 20 | 21 | """ 22 | 23 | import argparse 24 | import json 25 | import os 26 | import numpy as np 27 | import tensorflow as tf 28 | import time 29 | 30 | import util 31 | 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 34 | parser.add_argument("--iters", default=11, type=int, 35 | help="Maximum number of additions") 36 | parser.add_argument("--size-mb", default=100, type=int, 37 | help="size of vector in MBs") 38 | parser.add_argument("--shards", default=1, type=int, 39 | help="how many ways to shard the variable") 40 | parser.add_argument('--image', 41 | default='Deep Learning AMI (Ubuntu) Version 15.0') 42 | parser.add_argument('--name', 43 | default='tf_two_machines_local') 44 | 45 | # internal flags 46 | parser.add_argument('--role', default='launcher', type=str) 47 | parser.add_argument("--sender-ip", default='127.0.0.1') 48 | parser.add_argument("--receiver-ip", default='127.0.0.1') 49 | args = parser.parse_args() 50 | 51 | cluster_spec = {'chief': [args.sender_ip + ':32300'], 52 | 'receiver': [args.receiver_ip + ':32301']} 53 | 54 | 55 | def _launch_server(role): 56 | os.environ['TF_CONFIG'] = json.dumps( 57 | {'cluster': cluster_spec, 58 | 'task': {'type': role, 'index': 0}}) 59 | config = tf.estimator.RunConfig() 60 | return tf.train.Server(config.cluster_spec, 61 | job_name=config.task_type, 62 | task_index=config.task_id) 63 | 64 | 65 | def run_launcher(): 66 | import ncluster 67 | ncluster.util.assert_script_in_current_directory() 68 | 69 | if args.aws: 70 | ncluster.set_backend('aws') 71 | 72 | # use 4GB instance, 0.5GB not enough 73 | worker = ncluster.make_task(args.name, image_name=args.image, 74 | instance_type='t3.medium') 75 | worker.upload(__file__) 76 | worker.upload('util.py') 77 | 78 | # kill python just for when tmux session reuse is on 79 | if not ncluster.running_locally(): 80 | # on AWS probably running in conda DLAMI, switch into TF-enabled env 81 | worker._run_raw('killall python', ignore_errors=True) 82 | worker.run('source activate tensorflow_p36') 83 | 84 | ip_config = f'--sender-ip={worker.ip} --receiver-ip={worker.ip}' 85 | worker.run(f'python {__file__} --role=receiver {ip_config}', 86 | non_blocking=True) 87 | worker.switch_window(1) # run in new tmux window 88 | if not ncluster.running_locally(): 89 | worker.run('source activate tensorflow_p36') 90 | worker.run( 91 | f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}') 92 | print(worker.read('out')) 93 | 94 | 95 | def run_receiver(): 96 | server = _launch_server('receiver') 97 | time.sleep(365 * 24 * 3600) 98 | del server 99 | 100 | 101 | def run_sender(): 102 | param_size = 250 * 1000 * args.size_mb // args.shards # 1MB is 250k integers 103 | log = util.FileLogger('out') 104 | grads_array = [] 105 | with tf.device('/job:chief/task:0'): 106 | # grads = tf.fill([param_size], 1.) 
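    # Sharding note: each shard below is a separate variable of roughly
    # (size_mb / shards) MB pinned to the sender job, with a matching variable
    # on the receiver job; the per-shard assign ops are grouped into a single
    # step, which presumably lets TensorFlow overlap the transfers (the
    # docstring above reports going 1 -> 4 shards saves ~30%).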
107 | for i in range(args.shards): 108 | grads = tf.Variable(tf.ones([param_size])) 109 | grads_array.append(grads) 110 | 111 | params_array = [] 112 | add_op_array = [] 113 | with tf.device('/job:receiver/task:0'): 114 | for i in range(args.shards): 115 | params = tf.Variable(tf.ones([param_size])) 116 | add_op = params.assign(grads_array[i]).op 117 | params_array.append(params) 118 | add_op_array.append(add_op) 119 | add_op = tf.group(*add_op_array) 120 | 121 | server = _launch_server('chief') 122 | sess = tf.Session(server.target) 123 | sess.run(tf.global_variables_initializer()) 124 | # except Exception as e: 125 | # # sometimes .run fails with .UnavailableError: OS Error 126 | # log(f"initialization failed with {e}, retrying in 1 second") 127 | # time.sleep(1) 128 | 129 | time_list = [] 130 | for i in range(args.iters): 131 | start_time = time.perf_counter() 132 | sess.run(add_op) 133 | elapsed_time_ms = (time.perf_counter() - start_time) * 1000 134 | time_list.append(elapsed_time_ms) 135 | rate = args.size_mb / (elapsed_time_ms / 1000) 136 | log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % ( 137 | i, args.iters, args.size_mb, elapsed_time_ms, rate)) 138 | 139 | min = np.min(time_list) 140 | median = np.median(time_list) 141 | 142 | log( 143 | f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 144 | 145 | 146 | def main(): 147 | # run local benchmark in launcher and launch service 148 | if args.role == "launcher": 149 | run_launcher() 150 | elif args.role == "sender": 151 | run_sender() 152 | elif args.role == "receiver": 153 | run_receiver() 154 | else: 155 | assert False, 'unknown role' 156 | 157 | 158 | if __name__ == '__main__': 159 | main() 160 | -------------------------------------------------------------------------------- /benchmarks/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | 5 | 6 | class FileLogger: 7 | """Helper class to log to file (possibly mirroring to stderr) 8 | logger = FileLogger('somefile.txt') 9 | logger = FileLogger('somefile.txt', mirror=True) 10 | logger('somemessage') 11 | logger('somemessage: %s %.2f', 'value', 2.5) 12 | """ 13 | 14 | def __init__(self, fn, mirror=True): 15 | self.fn = fn 16 | self.f = open(fn, 'w') 17 | self.mirror = mirror 18 | print(f"Creating FileLogger on {os.path.abspath(fn)}") 19 | 20 | def __call__(self, s='', *args): 21 | """Either ('asdf %f', 5) or (val1, val2, val3, ...)""" 22 | if (isinstance(s, str) or isinstance(s, bytes)) and '%' in s: 23 | formatted_s = s % args 24 | else: 25 | toks = [s] + list(args) 26 | formatted_s = ', '.join(str(s) for s in toks) 27 | 28 | self.f.write(formatted_s + '\n') 29 | self.f.flush() 30 | if self.mirror: 31 | # use manual flushing because "|" makes output 4k buffered instead of 32 | # line-buffered 33 | sys.stdout.write(formatted_s+'\n') 34 | sys.stdout.flush() 35 | 36 | def __del__(self): 37 | self.f.close() 38 | 39 | 40 | def ossystem(cmd): 41 | """Like os.system, but returns output of command as string.""" 42 | p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, 43 | stderr=subprocess.STDOUT) 44 | (stdout, stderr) = p.communicate() 45 | return stdout.decode('ascii') if stdout else '' 46 | -------------------------------------------------------------------------------- /examples/deleteme.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | print(sys.argv[0]) 4 | 
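# (Scratch example: just prints the path the script was invoked with,
# presumably used to sanity-check upload/run round-trips.)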
-------------------------------------------------------------------------------- /examples/gpubox.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Launch a single GPU instance with jupyter notebook 4 | 5 | import argparse 6 | import os 7 | import ncluster 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--name', type=str, default='gpubox', 11 | help="instance name") 12 | parser.add_argument('--image-name', type=str, 13 | default='Deep Learning AMI (Ubuntu) Version 23.0', 14 | help="name of AMI to use ") 15 | parser.add_argument('--instance-type', type=str, default='p3.2xlarge', 16 | help="type of instance") 17 | parser.add_argument('--password', 18 | default='DefaultNotebookPasswordPleaseChange', 19 | help='password to use for jupyter notebook') 20 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 21 | 22 | args = parser.parse_args() 23 | module_path = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | def main(): 26 | task = ncluster.make_task(name=args.name, 27 | instance_type=args.instance_type, 28 | image_name=args.image_name) 29 | 30 | # upload notebook config with provided password 31 | jupyter_config_fn = _create_jupyter_config(args.password) 32 | remote_config_fn = '~/.jupyter/jupyter_notebook_config.py' 33 | task.upload(jupyter_config_fn, remote_config_fn) 34 | 35 | # upload sample notebook and start Jupyter server 36 | task.run('mkdir -p /ncluster/notebooks') 37 | task.upload(f'{module_path}/gpubox_sample.ipynb', 38 | '/ncluster/notebooks/gpubox_sample.ipynb', 39 | dont_overwrite=True) 40 | task.run('cd /ncluster/notebooks') 41 | task.run('jupyter notebook', non_blocking=True) 42 | print(f'Jupyter notebook will be at http://{task.public_ip}:8888') 43 | 44 | 45 | def _create_jupyter_config(password): 46 | from notebook.auth import passwd 47 | sha = passwd(args.password) 48 | local_config_fn = f'{module_path}/gpubox_jupyter_notebook_config.py' 49 | temp_config_fn = '/tmp/' + os.path.basename(local_config_fn) 50 | os.system(f'cp {local_config_fn} {temp_config_fn}') 51 | _replace_lines(temp_config_fn, 'c.NotebookApp.password', 52 | f"c.NotebookApp.password = '{sha}'") 53 | return temp_config_fn 54 | 55 | 56 | def _replace_lines(fn, startswith, new_line): 57 | """Replace lines starting with starts_with in fn with new_line.""" 58 | new_lines = [] 59 | for line in open(fn): 60 | if line.startswith(startswith): 61 | new_lines.append(new_line) 62 | else: 63 | new_lines.append(line) 64 | with open(fn, 'w') as f: 65 | f.write('\n'.join(new_lines)) 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /examples/gpubox_sample.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Mon Aug 13 23:41:40 2018 \r\n", 13 | "+-----------------------------------------------------------------------------+\r\n", 14 | "| NVIDIA-SMI 396.37 Driver Version: 396.37 |\r\n", 15 | "|-------------------------------+----------------------+----------------------+\r\n", 16 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", 17 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. 
|\r\n", 18 | "|===============================+======================+======================|\r\n", 19 | "| 0 Tesla M60 On | 00000000:00:1E.0 Off | 0 |\r\n", 20 | "| N/A 43C P8 14W / 150W | 0MiB / 7618MiB | 0% Default |\r\n", 21 | "+-------------------------------+----------------------+----------------------+\r\n", 22 | " \r\n", 23 | "+-----------------------------------------------------------------------------+\r\n", 24 | "| Processes: GPU Memory |\r\n", 25 | "| GPU PID Type Process name Usage |\r\n", 26 | "|=============================================================================|\r\n", 27 | "| No running processes found |\r\n", 28 | "+-----------------------------------------------------------------------------+\r\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "!nvidia-smi" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [] 42 | } 43 | ], 44 | "metadata": { 45 | "kernelspec": { 46 | "display_name": "Python [default]", 47 | "language": "python", 48 | "name": "python3" 49 | }, 50 | "language_info": { 51 | "codemirror_mode": { 52 | "name": "ipython", 53 | "version": 3 54 | }, 55 | "file_extension": ".py", 56 | "mimetype": "text/x-python", 57 | "name": "python", 58 | "nbconvert_exporter": "python", 59 | "pygments_lexer": "ipython3", 60 | "version": "3.6.4" 61 | }, 62 | "toc": { 63 | "colors": { 64 | "hover_highlight": "#DAA520", 65 | "running_highlight": "#FF0000", 66 | "selected_highlight": "#FFD700" 67 | }, 68 | "moveMenuLeft": true, 69 | "nav_menu": { 70 | "height": "12px", 71 | "width": "252px" 72 | }, 73 | "navigate_menu": true, 74 | "number_sections": true, 75 | "sideBar": true, 76 | "threshold": 4, 77 | "toc_cell": false, 78 | "toc_section_display": "block", 79 | "toc_window_display": false 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 2 84 | } 85 | -------------------------------------------------------------------------------- /examples/launch_16_instances.py: -------------------------------------------------------------------------------- 1 | import ncluster 2 | import time 3 | 4 | def main(): 5 | ncluster.set_backend('aws') 6 | 7 | start_time = time.time() 8 | job = ncluster.make_job(num_tasks=16) 9 | print(f"waited for startup for {time.time()-start_time} seconds") 10 | 11 | start_time = time.time() 12 | job.run('sleep 10') 13 | print(f"waited for exec for {time.time()-start_time} seconds") 14 | 15 | if __name__ == '__main__': 16 | main() 17 | -------------------------------------------------------------------------------- /examples/ray_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Example of two process Ray program, worker sends values to parameter 4 | # server on a different machine 5 | # 6 | # Run locally: 7 | # ./ray_example.py 8 | # 9 | # Run on AWS: 10 | # ./ray_example.py --aws 11 | 12 | import argparse 13 | import os 14 | import time 15 | 16 | import numpy as np 17 | import ray 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--role", default='launcher', type=str, 21 | help="launcher/driver") 22 | parser.add_argument('--image', default='Deep Learning AMI (Ubuntu) Version 13.0') 23 | parser.add_argument("--size-mb", default=10, type=int, help='how much data to send at each iteration') 24 | parser.add_argument("--iters", default=10, type=int) 25 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 26 | parser.add_argument("--ip", 
default='', type=str, 27 | help="internal flag, used to point worker to head node") 28 | 29 | args = parser.parse_args() 30 | 31 | dim = args.size_mb * 250 * 1000 32 | 33 | 34 | @ray.remote(resources={"worker": 1}) 35 | class Worker(object): 36 | def __init__(self): 37 | self.gradients = np.ones(dim, dtype=np.float32) 38 | 39 | def compute_gradients(self): 40 | return self.gradients 41 | 42 | 43 | @ray.remote(resources={"ps": 1}) 44 | class ParameterServer(object): 45 | def __init__(self): 46 | self.params = np.zeros(dim, dtype=np.float32) 47 | 48 | def assign_add(self, grad): 49 | self.params += grad 50 | return self.params 51 | 52 | def get_weights(self): 53 | return self.params 54 | 55 | 56 | def run_launcher(): 57 | import ncluster 58 | 59 | if args.aws: 60 | ncluster.set_backend('aws') 61 | 62 | script = os.path.basename(__file__) 63 | assert script in os.listdir('.') 64 | job = ncluster.make_job(install_script='pip install ray', 65 | image_name=args.image, 66 | instance_type='c5.large', 67 | num_tasks=2) 68 | job.upload(script) 69 | job.run('export RAY_USE_XRAY=1') 70 | job.run('ray stop') 71 | 72 | # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources 73 | ps_resource = """--resources='{"ps": 1}'""" 74 | worker_resource = """--resources='{"worker": 1}'""" 75 | ps, worker = job.tasks 76 | ps.run(f"ray start --head {ps_resource} --redis-port=6379") 77 | worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}") 78 | worker.run(f'./{script} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}') 79 | 80 | 81 | def run_driver(): 82 | ray.init(redis_address=args.ip) 83 | 84 | worker = Worker.remote() 85 | ps = ParameterServer.remote() 86 | 87 | for iteration in range(args.iters): 88 | start_time = time.time() 89 | grads = worker.compute_gradients.remote() 90 | result = ps.assign_add.remote(grads) 91 | result = ray.get(result)[0] 92 | elapsed_time = time.time() - start_time 93 | rate = args.size_mb / elapsed_time 94 | print('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (result, args.iters, args.size_mb, elapsed_time * 1000, rate)) 95 | 96 | 97 | def main(): 98 | if args.role == 'launcher': 99 | run_launcher() 100 | elif args.role == 'driver': 101 | run_driver() 102 | else: 103 | assert False, f"Unknown role {args.role}, must be laucher/driver" 104 | 105 | 106 | if __name__ == '__main__': 107 | main() 108 | -------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter # for notebook.auth.passwd 2 | tensorflow 3 | torch 4 | ray 5 | -------------------------------------------------------------------------------- /examples/simple_job.py: -------------------------------------------------------------------------------- 1 | import ncluster 2 | import time 3 | 4 | def main(): 5 | ncluster.set_backend('local') 6 | 7 | job = ncluster.make_job(num_tasks=2) 8 | 9 | start_time = time.time() 10 | job.run('sleep 1') 11 | print(f"waited for {time.time()-start_time} seconds") 12 | 13 | if __name__ == '__main__': 14 | main() 15 | -------------------------------------------------------------------------------- /examples/simple_task.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import ncluster 3 | 4 | # allocate default machine type and default image 5 | task = ncluster.make_task() 6 | output = task.run('ifconfig') 7 | print(f"Task ifconfig 
returned {output}") 8 | -------------------------------------------------------------------------------- /examples/simple_tf.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | import sys 3 | 4 | if not sys.argv[1:]: 5 | import ncluster 6 | task = ncluster.make_task(instance_type='t3.micro') 7 | task.upload(__file__) 8 | task.run('pip install tensorflow') 9 | task.run(f'python {__file__} worker') 10 | elif sys.argv[1] == 'worker': 11 | import tensorflow as tf 12 | import os 13 | sess = tf.Session() 14 | ones = tf.ones((1000,1000)) 15 | result = sess.run(tf.matmul(ones, ones)) 16 | print(f"matmul gave {result.sum()}") 17 | os.system('sudo shutdown -h -P 10') # shut down the instance in 10 mins 18 | 19 | -------------------------------------------------------------------------------- /examples/tf_adder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | """ 5 | TensorFlow distributed benchmark. Create sender/receiver tasks and add arrays from sender tasks to variable on receiver. 6 | 7 | To run locally: 8 | ./tf_adder.py 9 | tmux a -t 0 10 | 11 | Should see something like this 12 | ``` 13 | 089/100 added 100 MBs in 114.9 ms: 1114.36 MB/second 14 | 090/100 added 100 MBs in 113.4 ms: 1128.61 MB/second 15 | 091/100 added 100 MBs in 113.4 ms: 1128.60 MB/second 16 | ``` 17 | 18 | 19 | To run on AWS 20 | aws configure # or set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY/AWS_DEFAULT_REGION 21 | ./tf_adder.py --aws 22 | nconnect 0.tf_adder 23 | 24 | Should see something like this with t3.large instances 25 | ``` 26 | 089/100 added 100 MBs in 253.8 ms: 504.27 MB/second 27 | 090/100 added 100 MBs in 252.6 ms: 506.63 MB/second 28 | 091/100 added 100 MBs in 255.0 ms: 501.92 MB/second 29 | ``` 30 | 31 | """ 32 | 33 | import argparse 34 | import json 35 | import os 36 | import tensorflow as tf 37 | import time 38 | 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 41 | parser.add_argument("--iters", default=20, type=int, help="Maximum number of additions") 42 | parser.add_argument("--data-mb", default=100, type=int, help="size of vector in MBs") 43 | parser.add_argument('--image', 44 | default='Deep Learning AMI (Ubuntu) Version 15.0') 45 | 46 | # internal flags 47 | parser.add_argument('--role', default='launcher', type=str) 48 | parser.add_argument("--sender-ip", default='127.0.0.1') 49 | parser.add_argument("--receiver-ip", default='127.0.0.1') 50 | args = parser.parse_args() 51 | 52 | cluster_spec = {'chief': [args.sender_ip + ':32300'], 53 | 'receiver': [args.receiver_ip + ':32301']} 54 | 55 | 56 | def _launch_server(role): 57 | os.environ['TF_CONFIG'] = json.dumps( 58 | {'cluster': cluster_spec, 59 | 'task': {'type': role, 'index': 0}}) 60 | config = tf.estimator.RunConfig() 61 | return tf.train.Server(config.cluster_spec, 62 | job_name=config.task_type, 63 | task_index=config.task_id) 64 | 65 | 66 | def run_launcher(): 67 | import ncluster 68 | if args.aws: 69 | ncluster.set_backend('aws') 70 | 71 | job = ncluster.make_job('tf_adder', num_tasks=2, image_name=args.image) 72 | job.upload(__file__) 73 | 74 | sender, receiver = job.tasks 75 | if ncluster.get_backend() == 'aws': 76 | # on AWS probably running in conda DLAMI, switch into TF-enabled env 77 | job.run('source activate tensorflow_p36') 78 | 79 | ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}' 80 | receiver.run(f'python 
tf_adder.py --role=receiver {ip_config}', 81 | non_blocking=True) 82 | sender.run(f'python tf_adder.py --role=sender {ip_config} --iters={args.iters}') 83 | 84 | 85 | def run_receiver(): 86 | server = _launch_server('receiver') 87 | time.sleep(365 * 24 * 3600) 88 | del server 89 | 90 | 91 | def run_sender(): 92 | param_size = 250 * 1000 * args.data_mb # 1MB is 250k integers 93 | with tf.device('/job:chief/task:0'): 94 | grads = tf.fill([param_size], 1.) 95 | 96 | with tf.device('/job:receiver/task:0'): 97 | params = tf.Variable(tf.ones([param_size])) 98 | add_op = params.assign_add(grads).op 99 | 100 | server = _launch_server('chief') 101 | sess = tf.Session(server.target) 102 | 103 | sess.run(tf.global_variables_initializer()) 104 | 105 | for i in range(args.iters): 106 | start_time = time.time() 107 | sess.run(add_op) 108 | elapsed_time = time.time() - start_time 109 | rate = args.data_mb / elapsed_time 110 | print('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.data_mb, elapsed_time * 1000, rate)) 111 | 112 | 113 | def main(): 114 | # run local benchmark in launcher and launch service 115 | if args.role == "launcher": 116 | run_launcher() 117 | elif args.role == "sender": 118 | run_sender() 119 | elif args.role == "receiver": 120 | run_receiver() 121 | else: 122 | assert False, 'unknown role' 123 | 124 | 125 | if __name__ == '__main__': 126 | main() 127 | -------------------------------------------------------------------------------- /examples/tf_adder_tb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | """ 5 | TensorFlow distributed benchmark + TensorBoard. Create sender/receiver tasks and add arrays from sender tasks to 6 | variable on receiver. 7 | 8 | To run locally: 9 | ./tf_adder_tb.py 10 | 11 | Should see something like this 12 | ``` 13 | ... 14 | Benchmark done, tensorboard at http://127.0.0.1:6006 15 | ``` 16 | 17 | 18 | To run on AWS 19 | aws configure # or set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY/AWS_DEFAULT_REGION 20 | 21 | ./tf_adder_tb.py --aws 22 | 23 | After a minute should see something like this 24 | 25 | ... 
26 | Benchmark done, tensorboard at http://35.173.134.87:6006 27 | """ 28 | 29 | import argparse 30 | import json 31 | import os 32 | import tensorflow as tf 33 | import time 34 | 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--role', default='launcher', type=str) 37 | parser.add_argument("--iters", default=20, help="Maximum number of additions") 38 | parser.add_argument("--data-mb", default=128, help="size of vector in MBs") 39 | parser.add_argument("--sender-ip", default='127.0.0.1') 40 | parser.add_argument("--receiver-ip", default='127.0.0.1') 41 | parser.add_argument("--logdir", help='logging directory') 42 | parser.add_argument("--aws", action='store_true') 43 | parser.add_argument('--image', default='Deep Learning AMI (Amazon Linux) Version 13.0') 44 | args = parser.parse_args() 45 | 46 | cluster_spec = {'chief': [args.sender_ip + ':32300'], 47 | 'receiver': [args.receiver_ip + ':32301']} 48 | 49 | 50 | def _launch_server(role): 51 | os.environ['TF_CONFIG'] = json.dumps( 52 | {'cluster': cluster_spec, 53 | 'task': {'type': role, 'index': 0}}) 54 | config = tf.estimator.RunConfig() 55 | return tf.train.Server(config.cluster_spec, 56 | job_name=config.task_type, 57 | task_index=config.task_id) 58 | 59 | 60 | def run_launcher(): 61 | import ncluster 62 | 63 | if args.aws: 64 | ncluster.set_backend('aws') 65 | job = ncluster.make_job('tf_adder_tb', num_tasks=2, image_name=args.image) 66 | job.upload(__file__) 67 | this_file = os.path.basename(__file__) 68 | 69 | sender, receiver = job.tasks 70 | if ncluster.get_backend() == 'aws': 71 | # on AWS probably are running in DLAMI, switch into TF-enabled env 72 | job.run('source activate tensorflow_p36') 73 | 74 | ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}' 75 | job.tasks[1].run(f'python {this_file} --role=receiver {ip_config}', non_blocking=True) 76 | job.tasks[0].run(f'python {this_file} --role=sender --logdir={job.logdir} {ip_config}') 77 | job.tasks[0].run(f'tensorboard --logdir={job.logdir}/..', non_blocking=True) 78 | print(f"Benchmark done, tensorboard at http://{job.tasks[0].public_ip}:6006") 79 | 80 | 81 | def run_receiver(): 82 | server = _launch_server('receiver') 83 | time.sleep(365 * 24 * 3600) 84 | del server 85 | 86 | 87 | def run_sender(): 88 | summary_writer = tf.summary.FileWriter(args.logdir) 89 | 90 | param_size = 250 * 1000 * args.data_mb # 1MB is 250k integers 91 | with tf.device('/job:chief/task:0'): 92 | grads = tf.fill([param_size], 1.) 
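  # The fill op above is placed on the chief (sender) task while the variable
  # below is pinned to the receiver task, so each sess.run(add_op) moves roughly
  # args.data_mb MB of tensor data between the two TF servers; that transfer is
  # what the per-iteration timing below measures.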
93 | 94 | with tf.device('/job:receiver/task:0'): 95 | params = tf.Variable(tf.ones([param_size])) 96 | add_op = params.assign_add(grads).op 97 | 98 | server = _launch_server('chief') 99 | sess = tf.Session(server.target) 100 | 101 | sess.run(tf.global_variables_initializer()) 102 | 103 | for i in range(args.iters): 104 | start_time = time.time() 105 | sess.run(add_op) 106 | elapsed_time = time.time() - start_time 107 | rate = args.data_mb / elapsed_time 108 | print('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.data_mb, elapsed_time * 1000, rate)) 109 | summary = tf.Summary() 110 | summary.value.add(tag='time_ms', simple_value=elapsed_time * 1000) 111 | summary_writer.add_summary(summary, i) 112 | 113 | summary_writer.close() 114 | 115 | 116 | def main(): 117 | # run local benchmark in launcher and launch service 118 | if args.role == "launcher": 119 | run_launcher() 120 | elif args.role == "sender": 121 | run_sender() 122 | elif args.role == "receiver": 123 | run_receiver() 124 | else: 125 | assert False, 'unknown role' 126 | 127 | 128 | if __name__ == '__main__': 129 | main() 130 | -------------------------------------------------------------------------------- /ncluster/__init__.py: -------------------------------------------------------------------------------- 1 | from . import aws_backend 2 | from . import aws_util 3 | from . import util 4 | 5 | from .aws_backend import make_task 6 | from .aws_backend import make_job 7 | 8 | # for type annotations 9 | from .aws_backend import Job 10 | from .aws_backend import Task 11 | 12 | from .aws_util import running_on_aws 13 | from .aws_util import get_zone 14 | from .aws_util import get_region 15 | 16 | from ._version import __version__ 17 | 18 | from .aws_backend import make_job 19 | from .aws_backend import make_task 20 | from .aws_backend import deprecated_set_backend as set_backend 21 | 22 | from . import aws_util as u 23 | 24 | from . import ncluster_globals 25 | 26 | 27 | print(f"ncluster version {__version__}") 28 | 29 | if not util.is_set('NCLUSTER_DISABLE_PDB_HANDLER') and not util.is_set('NCLUSTER_RUNNING_UNDER_CIRCLECI'): 30 | util.install_pdb_handler() # CTRL+\ drops into pdb 31 | 32 | -------------------------------------------------------------------------------- /ncluster/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.91' 2 | -------------------------------------------------------------------------------- /ncluster/local_backend.py: -------------------------------------------------------------------------------- 1 | """EXPERIMENTAL local backend which mirrors aws_backend API. Useful for debugging 2 | 3 | Not thread-safe. 4 | """ 5 | 6 | import glob 7 | import os 8 | import shlex 9 | import socket 10 | import time 11 | from typing import List, Tuple 12 | 13 | from ncluster import ncluster_globals 14 | from . 
import util 15 | 16 | TASKDIR_ROOT = '/tmp/ncluster/task' 17 | SCRATCH_ROOT = '/tmp/ncluster/scratch' 18 | # DEFAULT_LOGDIR_ROOT = '/ncluster/runs' 19 | 20 | 21 | # todo: tmux session names are backwards from AWS job names (runname-jobname) 22 | # TODO: add kwargs so that tmux backend can be drop-in replacement 23 | 24 | 25 | # TODO: rename extra_kwargs to kwargs everywhere 26 | class Task: 27 | """Local tasks interact with tmux session where session name is derived 28 | from job name, and window names are task ids.""" 29 | tmux_window_id: int 30 | tmux_available_window_ids: List[int] 31 | 32 | def __init__(self, name, *, tmux_session, install_script='', job=None, 33 | **kwargs): 34 | 35 | self.last_status = None 36 | self.homedir = os.environ['HOME'] 37 | self._cmd_fn = None 38 | self._cmd = None 39 | self._status_fn = None # location of output of last status 40 | self._out_fn = None 41 | 42 | self._can_run = False 43 | self.tmux_session = tmux_session 44 | self.tmux_window_id = 0 45 | self.tmux_available_window_ids = [0] 46 | 47 | self.name = name 48 | self.install_script = install_script 49 | self.job = job 50 | self.kwargs = kwargs 51 | 52 | # local servers sometimes listen only on localhost (TensorBoard), and sometimes only on 53 | # externally assigned ip address from gethostbyname (Ray), must choose one, so use the localhost for TB compatibility 54 | # https://github.com/ray-project/ray/issues/1677 55 | self.public_ip = socket.gethostbyname(socket.gethostname()) 56 | # self.public_ip = '127.0.0.1' 57 | self.ip = self.public_ip 58 | 59 | self.connect_instructions = 'tmux a -t ' + self.tmux_session 60 | 61 | # task current dir 62 | print('name is', name) 63 | # tmpdir = f"{util.reverse_taskname(name)}.{os.getpid()}.{util.now_micros()}" 64 | launch_id = util.random_id() 65 | self.taskdir = f"{TASKDIR_ROOT}/{name}-{launch_id}" 66 | self.local_scratch = f"{SCRATCH_ROOT}/{name}-{launch_id}" 67 | self.remote_scratch = f"{SCRATCH_ROOT}/{name}-{launch_id}" 68 | 69 | self.log(f"Creating taskdir {self.taskdir}") 70 | self._run_raw('mkdir -p ' + self.taskdir) 71 | 72 | self.log(f"Creating scratch {self.local_scratch}") 73 | self._run_raw('rm -Rf ' + self.local_scratch) 74 | self._run_raw('mkdir -p ' + self.local_scratch) 75 | self._run_raw('mkdir -p ' + self.remote_scratch) 76 | self.run_counter = 0 77 | 78 | self._cwd = self.taskdir 79 | self._can_run = True 80 | self.run('cd ' + self.taskdir) 81 | 82 | print("Running install script " + install_script) 83 | self.install_script = install_script 84 | for line in install_script.split('\n'): 85 | self.run(line) 86 | 87 | def run(self, cmd, non_blocking=False, ignore_errors=False, **_kwargs): 88 | 89 | if util.is_set('NCLUSTER_RUN_WITH_OUTPUT_ON_FAILURE'): 90 | # HACK 91 | if not util.is_bash_builtin(cmd) or True: 92 | return self._run_with_output_on_failure(cmd, non_blocking, ignore_errors, **_kwargs) 93 | else: 94 | self.log("Found bash built-in, using regular run") 95 | 96 | if not self._can_run: 97 | assert False, "Using .run before initialization finished" 98 | if '\n' in cmd: 99 | cmds = cmd.split('\n') 100 | self.log( 101 | f"Running {len(cmds)} commands at once, returning status of last") 102 | status = -1 103 | for subcmd in cmds: 104 | status = self.run(subcmd) 105 | return status 106 | 107 | cmd = cmd.strip() 108 | if not cmd or cmd.startswith('#'): # ignore empty/commented out lines 109 | return -1 110 | self.run_counter += 1 111 | self.log("tmux> %s", cmd) 112 | 113 | self._cmd = cmd 114 | self._cmd_fn = 
f'{self.local_scratch}/{self.run_counter}.cmd' 115 | self._status_fn = f'{self.remote_scratch}/{self.run_counter}.status' 116 | assert not os.path.exists(self._status_fn) 117 | 118 | cmd = util.shell_strip_comment(cmd) 119 | # assert '&' not in cmd, f"cmd {cmd} contains &, that breaks things" 120 | 121 | self.write(self._cmd_fn, cmd + '\n') 122 | modified_cmd = f'{cmd} ; echo $? > {self._status_fn}' 123 | modified_cmd = shlex.quote(modified_cmd) 124 | 125 | tmux_window = self.tmux_session+':'+str(self.tmux_window_id) 126 | tmux_cmd = f'tmux send-keys -t {tmux_window} {modified_cmd} Enter' 127 | self._run_raw(tmux_cmd, ignore_errors=ignore_errors) 128 | if non_blocking: 129 | return 0 130 | 131 | if not self.wait_for_file(self._status_fn, max_wait_sec=60): 132 | self.log(f"Retrying waiting for {self._status_fn}") 133 | while not self.exists(self._status_fn): 134 | self.log(f"Still waiting for {cmd}") 135 | self.wait_for_file(self._status_fn, max_wait_sec=60) 136 | contents = self.read(self._status_fn) 137 | 138 | # if empty wait a bit to allow for race condition 139 | if len(contents) == 0: 140 | time.sleep(0.01) 141 | status = int(open(self._status_fn).read().strip()) 142 | self.last_status = status 143 | 144 | if status != 0: 145 | if not ignore_errors: 146 | raise RuntimeError(f"Command {cmd} returned status {status}") 147 | else: 148 | self.log(f"Warning: command {cmd} returned status {status}") 149 | 150 | return status 151 | 152 | def join(self, ignore_errors=False): 153 | """Waits until last executed command completed.""" 154 | assert self._status_fn, "Asked to join a task which hasn't had any commands executed on it" 155 | check_interval = 0.2 156 | status_fn = self._status_fn 157 | if not self.wait_for_file(status_fn, max_wait_sec=30): 158 | self.log(f"Retrying waiting for {status_fn}") 159 | while not self.exists(status_fn): 160 | self.log(f"Still waiting for {self._cmd}") 161 | self.wait_for_file(status_fn, max_wait_sec=30) 162 | contents = self.read(status_fn) 163 | 164 | # if empty wait a bit to allow for race condition 165 | if len(contents) == 0: 166 | time.sleep(check_interval) 167 | contents = self.read(status_fn) 168 | status = int(contents.strip()) 169 | self.last_status = status 170 | 171 | if status != 0: 172 | extra_msg = '(ignoring error)' if ignore_errors else '(failing)' 173 | if util.is_set('NCLUSTER_RUN_WITH_OUTPUT_ON_FAILURE'): 174 | self.log( 175 | f"Start failing output {extra_msg}: \n{'*'*80}\n\n '{self.read(self._out_fn)}'") 176 | self.log(f"\n{'*'*80}\nEnd failing output") 177 | if not ignore_errors: 178 | raise RuntimeError(f"Command {self._cmd} returned status {status}") 179 | else: 180 | self.log(f"Warning: command {self._cmd} returned status {status}") 181 | 182 | return status 183 | 184 | def switch_window(self, window_id: int): 185 | """ 186 | Switches currently active tmux window for given task. 0 is the default window 187 | Args: 188 | window_id: integer id of tmux window to use 189 | """ 190 | 191 | # windows are numbered sequentially 0, 1, 2, ... 
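    # (switching only changes which window subsequent .run() calls send keys to;
    #  it does not change the window shown in an attached tmux client)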
192 | # create any missing windows and make them point to the same directory 193 | if window_id not in self.tmux_available_window_ids: 194 | for i in range(max(self.tmux_available_window_ids)+1, window_id+1): 195 | self._run_raw(f'tmux new-window -t {self.tmux_session} -d') 196 | 197 | tmux_window = self.tmux_session + ':' + str(i) 198 | cmd = shlex.quote(f'cd {self.taskdir}') 199 | tmux_cmd = f'tmux send-keys -t {tmux_window} {cmd} Enter' 200 | self._run_raw(tmux_cmd) 201 | self.tmux_available_window_ids.append(i) 202 | 203 | self.tmux_window_id = window_id 204 | 205 | # This is a future "run" command, will become "run" once all cases are checked 206 | def _run_with_output_on_failure(self, cmd, non_blocking=False, ignore_errors=False, **_kwargs) -> str: 207 | if not self._can_run: 208 | assert False, "Using .run before initialization finished" 209 | if '\n' in cmd: 210 | cmds = cmd.split('\n') 211 | self.log( 212 | f"Running {len(cmds)} commands at once, returning status of last") 213 | status = -1 214 | for subcmd in cmds: 215 | status = self.run(subcmd) 216 | return status 217 | 218 | cmd = cmd.strip() 219 | if not cmd or cmd.startswith('#'): # ignore empty/commented out lines 220 | return '' 221 | self.run_counter += 1 222 | self.log("tmux> %s", cmd) 223 | 224 | self._cmd = cmd 225 | self._cmd_fn = f'{self.local_scratch}/{self.run_counter}.cmd' 226 | self._status_fn = f'{self.remote_scratch}/{self.run_counter}.status' 227 | self._out_fn = f'{self.remote_scratch}/{self.run_counter}.out' 228 | assert not os.path.exists(self._status_fn) 229 | 230 | cmd = util.shell_strip_comment(cmd) 231 | # assert '&' not in cmd, f"cmd {cmd} contains &, that breaks things" 232 | 233 | self.write(self._cmd_fn, cmd + '\n') 234 | # modified_cmd = f'{cmd} ; echo $? > {self._status_fn}' 235 | modified_cmd = f'{cmd} > >(tee -a {self._out_fn}) 2> >(tee -a {self._out_fn} >&2); echo $? 
> {self._status_fn}' 236 | modified_cmd = shlex.quote(modified_cmd) 237 | 238 | tmux_window = self.tmux_session+':'+str(self.tmux_window_id) 239 | tmux_cmd = f'tmux send-keys -t {tmux_window} {modified_cmd} Enter' 240 | self._run_raw(tmux_cmd) 241 | if non_blocking: 242 | return '' 243 | 244 | if not self.wait_for_file(self._status_fn, max_wait_sec=60): 245 | self.log(f"Retrying waiting for {self._status_fn}") 246 | while not self.exists(self._status_fn): 247 | self.log(f"Still waiting for {cmd}") 248 | self.wait_for_file(self._status_fn, max_wait_sec=60) 249 | contents = self.read(self._status_fn) 250 | 251 | # if empty wait a bit to allow for race condition 252 | if len(contents) == 0: 253 | time.sleep(0.01) 254 | status = int(open(self._status_fn).read().strip()) 255 | self.last_status = status 256 | 257 | if status != 0: 258 | extra_msg = '(ignoring error)' if ignore_errors else '(failing)' 259 | self.log( 260 | f"Start failing output {extra_msg}: \n{'*'*80}\n\n '{self.read(self._out_fn)}'") 261 | self.log(f"\n{'*'*80}\nEnd failing output") 262 | if not ignore_errors: 263 | raise RuntimeError(f"Command {cmd} returned status {status}") 264 | else: 265 | self.log(f"Warning: command {cmd} returned status {status}") 266 | 267 | return self.read(self._out_fn) 268 | 269 | def _run_raw(self, cmd, ignore_errors=False): 270 | """Runs command directly, skipping tmux interface""" 271 | # TODO: capture stdout/stderr for feature parity with aws_backend 272 | result = os.system(cmd) 273 | if result != 0: 274 | if ignore_errors: 275 | self.log(f"command ({cmd}) failed.") 276 | assert False, "_run_raw failed" 277 | 278 | def rsync(self, local_fn, remote_fn=None): 279 | self.upload(local_fn, remote_fn) 280 | 281 | def upload(self, local_fn, remote_fn=None, dont_overwrite=False): 282 | """Uploads file to remote instance. If location not specified, dumps it 283 | into default directory. Creates missing directories in path name.""" 284 | 285 | # support wildcard through glob 286 | if '*' in local_fn: 287 | for local_subfn in glob.glob(local_fn): 288 | self.upload(local_subfn) 289 | return 290 | 291 | if remote_fn is None: 292 | remote_fn = os.path.basename(local_fn) 293 | 294 | if dont_overwrite and self.exists(remote_fn): 295 | self.log("Remote file %s exists, skipping" % (remote_fn,)) 296 | return 297 | 298 | if not remote_fn.startswith('/'): 299 | remote_fn = self.taskdir + '/' + remote_fn 300 | 301 | remote_fn = remote_fn.replace('~', self.homedir) 302 | self.log('uploading ' + local_fn + ' to ' + remote_fn) 303 | 304 | local_fn = os.path.abspath(local_fn) 305 | self._run_raw("cp -R %s %s" % (local_fn, remote_fn)) 306 | 307 | def download(self, remote_fn, local_fn='.'): 308 | if local_fn == '.': 309 | local_fn = self._cwd 310 | # self.log("downloading %s to %s" % (remote_fn, local_fn)) 311 | if not remote_fn.startswith('/'): 312 | remote_fn = self._cwd + '/' + remote_fn 313 | if self.exists(remote_fn): 314 | os.system(f'cp {remote_fn} {local_fn}') 315 | else: 316 | raise RuntimeError(f"No such file {remote_fn}") 317 | 318 | @staticmethod 319 | def exists(remote_fn): 320 | return os.path.exists(remote_fn) 321 | 322 | def read(self, remote_fn): 323 | tmp_fn = self.local_scratch + '/' + str(util.now_micros()) 324 | self.download(remote_fn, tmp_fn) 325 | return open(tmp_fn).read() 326 | 327 | def write(self, remote_fn, contents): 328 | def make_temp_fn(): 329 | """Returns temporary filename for this task.""" 330 | return self.local_scratch + '/write.' 
+ str(util.now_micros()) 331 | 332 | tmp_fn = make_temp_fn() 333 | open(tmp_fn, 'w').write(contents) 334 | self.upload(tmp_fn, remote_fn) 335 | 336 | # don't include file streaming for now 337 | # the issue is that file streaming by default turns on 4K buffering, which makes 338 | # streaming a lot less useful. Similar buffering is turned on for piping commands 339 | # https://unix.stackexchange.com/questions/25372/turn-off-buffering-in-pipe 340 | # def file_stream(self, fn: str) -> None: 341 | # # if not fn.startswith('/'): 342 | # # fn = self.taskdir + '/' + fn 343 | # 344 | # if not os.path.exists(fn): 345 | # os.system('mkdir -p ' + os.path.dirname(os.path.abspath(fn))) 346 | # os.system('touch ' + fn) 347 | # 348 | # p = subprocess.Popen(['tail', '-f', fn], stdout=subprocess.PIPE) 349 | # 350 | # for line in iter(p.stdout.readline, ''): 351 | # sys.stdout.write(line.decode('ascii', errors='ignore')) 352 | 353 | @property 354 | def logdir(self): 355 | """Returns logging directory, creating one if necessary. See "Logdir" section of design doc on naming convention.""" 356 | 357 | run_name = ncluster_globals.get_run_for_task(self) 358 | logdir = ncluster_globals.get_logdir(run_name) 359 | if logdir: 360 | return logdir 361 | 362 | # create logdir. Only single task in a group creates the logdir 363 | if ncluster_globals.is_chief(self, run_name): 364 | chief = self 365 | else: 366 | chief = ncluster_globals.get_chief(run_name) 367 | 368 | chief.setup_logdir() 369 | return ncluster_globals.get_logdir(run_name) 370 | # release lock 371 | 372 | def setup_logdir(self): 373 | # todo: locking on logdir creation 374 | 375 | """Create logdir for task/job/run. No-op if the task is not chief (0'th task of 0'th job of run) 376 | """ 377 | run_name = ncluster_globals.get_run_for_task(self) 378 | self.log("Creating logdir for run "+run_name) 379 | logdir_root = ncluster_globals.LOGDIR_ROOT 380 | assert logdir_root, "LOGDIR_ROOT not set, make sure you have called ncluster.set_backend()" 381 | 382 | self.run(f'mkdir -p {logdir_root}') 383 | find_command = f'find {logdir_root} -maxdepth 1 -type d' 384 | 385 | stdout, stderr = self.run_with_output(find_command) 386 | logdir = f"{logdir_root}/{run_name}" 387 | 388 | counter = 0 389 | while logdir in stdout: 390 | counter += 1 391 | new_logdir = f'{logdir_root}/{run_name}.{counter:02d}' 392 | self.log(f'Warning, logdir {logdir} exists, deduping to {new_logdir}') 393 | logdir = new_logdir 394 | self.run(f'mkdir -p {logdir}') 395 | 396 | ncluster_globals.set_logdir(run_name, logdir) 397 | return logdir 398 | 399 | def log(self, message, *args): 400 | """Log to launcher console.""" 401 | if args: 402 | message %= args 403 | 404 | print(f"{util.current_timestamp()} {self.name}: {message}") 405 | 406 | def wait_for_file(self, fn: str, max_wait_sec: int = 3600 * 24 * 365, 407 | check_interval: float = 0.02) -> bool: 408 | """ 409 | Waits for file maximum of max_wait_sec. 
Returns True if file was detected within specified max_wait_sec 410 | Args: 411 | fn: filename on task machine 412 | max_wait_sec: how long to wait in seconds 413 | check_interval: how often to check in seconds 414 | Returns: 415 | False if waiting was was cut short by max_wait_sec limit, True otherwise 416 | """ 417 | # print("Waiting for file", fn) 418 | start_time = time.time() 419 | while True: 420 | if time.time() - start_time > max_wait_sec: 421 | util.log(f"Timeout exceeded ({max_wait_sec} sec) for {fn}") 422 | return False 423 | if not self.exists(fn): 424 | time.sleep(check_interval) 425 | continue 426 | else: 427 | break 428 | return True 429 | 430 | # TODO: reuse regular run 431 | def run_with_output(self, cmd, non_blocking=False, ignore_errors=False) -> \ 432 | Tuple[str, str]: 433 | """ 434 | 435 | Args: 436 | cmd: single line shell command to run 437 | non_blocking (bool): if True, does not wait for command to finish 438 | ignore_errors: if True, will succeed even if command failed 439 | 440 | Returns: 441 | Contents of stdout/stderr as strings. 442 | Raises 443 | RuntimeException: if command produced non-0 returncode 444 | 445 | """ 446 | 447 | assert '\n' not in cmd, "Do not support multi-line commands" 448 | cmd: str = cmd.strip() 449 | if not cmd or cmd.startswith('#'): # ignore empty/commented out lines 450 | return '', '' 451 | 452 | stdout_fn = f"{self.remote_scratch}/{self.run_counter+1}.stdout" 453 | stderr_fn = f"{self.remote_scratch}/{self.run_counter+1}.stderr" 454 | cmd2 = f"{cmd} > {stdout_fn} 2> {stderr_fn}" 455 | 456 | assert not non_blocking, "Getting output doesn't work with non_blocking" 457 | status = self.run(cmd2, False, ignore_errors=True) 458 | stdout = self.read(stdout_fn) 459 | stderr = self.read(stderr_fn) 460 | 461 | if self.last_status > 0: 462 | self.log(f"Warning: command '{cmd}' returned {status}," 463 | f" stdout was '{stdout}' stderr was '{stderr}'") 464 | if not ignore_errors: 465 | raise RuntimeError(f"Warning: command '{cmd}' returned {status}," 466 | f" stdout was '{stdout}' stderr was '{stderr}'") 467 | 468 | return stdout, stderr 469 | 470 | 471 | def make_task(name='', 472 | run_name='', 473 | **kwargs) -> Task: 474 | """Create task, also create dummy run if not specified.""" 475 | ncluster_globals.task_launched = True 476 | 477 | name = ncluster_globals.auto_assign_task_name_if_needed(name) 478 | 479 | # tmux can't use . for session names 480 | tmux_session = name.replace('.', '=') 481 | tmux_window_id = 0 482 | util.log(f'killing session {tmux_session}') 483 | 484 | if not util.is_set("NCLUSTER_NOKILL_TMUX"): 485 | os.system(f'tmux kill-session -t {tmux_session}') 486 | os.system(f'tmux new-session -s {tmux_session} -n {tmux_window_id} -d') 487 | 488 | task = Task(name, 489 | tmux_session=tmux_session, # propagate optional args 490 | run_name=run_name, 491 | **kwargs) 492 | ncluster_globals.register_task(task, run_name) 493 | return task 494 | -------------------------------------------------------------------------------- /ncluster/ncluster_cloud_setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Creates resources. 
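# (checks/creates the default VPC, an ncluster security group opening SSH/NFS/
#  Jupyter/TensorBoard ports, an SSH keypair saved locally, and an EFS volume
#  with a mount target per subnet)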
4 | # To run standalone: 5 | # python -m ncluster.ncluster_cloud_setup 6 | # 7 | # This script creates VPC/security group/keypair if not already present 8 | 9 | import os 10 | import sys 11 | import time 12 | from typing import Tuple, Any, Optional 13 | 14 | from boto3_type_annotations.ec2 import SecurityGroup 15 | 16 | from ncluster import aws_util as u 17 | from ncluster import util 18 | 19 | DRYRUN = False 20 | DEBUG = True 21 | 22 | # Names of Amazon resources that are created. These settings are fixed across 23 | # all runs, and correspond to resources created once per user per region. 24 | 25 | NFS_PORT = 2049 26 | PUBLIC_TCP_RANGES = [ 27 | 22, # ssh 28 | NFS_PORT, # NFS port NFS peering between security groups 29 | # ipython notebook ports 30 | (8888, 8899), 31 | # redis port 32 | 6379, 33 | # tensorboard ports 34 | (6006, 6016) 35 | ] 36 | 37 | PUBLIC_UDP_RANGES = [NFS_PORT, (60000, 61000)] # mosh ports 38 | 39 | 40 | def network_setup() -> Tuple[Any, Any]: 41 | """Creates VPC if it doesn't already exists, configures it for public 42 | internet access, returns vpc, subnet, security_group""" 43 | 44 | # from https://gist.github.com/nguyendv/8cfd92fc8ed32ebb78e366f44c2daea6 45 | 46 | ec2 = u.get_ec2_resource() 47 | client = u.get_ec2_client() 48 | existing_vpcs = u.get_vpc_dict() 49 | zones = u.get_zones() 50 | 51 | # create VPC from scratch. Remove this if default VPC works well enough. 52 | create_non_default_vpc = False 53 | 54 | if create_non_default_vpc: 55 | vpc_name = u.get_vpc_name() 56 | if u.get_vpc_name() in existing_vpcs: 57 | print("Reusing VPC " + vpc_name) 58 | vpc = existing_vpcs[vpc_name] 59 | subnets = list(vpc.subnets.all()) 60 | assert len(subnets) == len( 61 | zones), "Has %s subnets, but %s zones, something went wrong during resource creation, try delete_resources.py/create_resources.py" % ( 62 | len(subnets), len(zones)) 63 | 64 | else: 65 | print("Creating VPC " + vpc_name) 66 | vpc = ec2.create_vpc(CidrBlock='192.168.0.0/16') 67 | 68 | # enable DNS on the VPC 69 | response = vpc.modify_attribute(EnableDnsHostnames={"Value": True}) 70 | assert u.is_good_response(response) 71 | response = vpc.modify_attribute(EnableDnsSupport={"Value": True}) 72 | assert u.is_good_response(response) 73 | 74 | vpc.create_tags(Tags=u.create_name_tags(vpc_name)) 75 | vpc.wait_until_available() 76 | 77 | gateways = u.get_gateway_dict(vpc) 78 | gateway_name = u.get_gateway_name() 79 | if gateway_name in gateways: 80 | print("Reusing gateways " + gateway_name) 81 | else: 82 | print("Creating internet gateway " + gateway_name) 83 | ig = ec2.create_internet_gateway() 84 | ig.attach_to_vpc(VpcId=vpc.id) 85 | ig.create_tags(Tags=u.create_name_tags(gateway_name)) 86 | 87 | # check that attachment succeeded 88 | attach_state = u.extract_attr_for_match(ig.attachments, State=-1, 89 | VpcId=vpc.id) 90 | assert attach_state == 'available', "vpc %s is in state %s" % (vpc.id, 91 | attach_state) 92 | route_table = vpc.create_route_table() 93 | route_table_name = u.get_route_table_name() 94 | route_table.create_tags(Tags=u.create_name_tags(route_table_name)) 95 | 96 | dest_cidr = '0.0.0.0/0' 97 | route_table.create_route( 98 | DestinationCidrBlock=dest_cidr, 99 | GatewayId=ig.id 100 | ) 101 | # check success 102 | for route in route_table.routes: 103 | # result looks like this 104 | # ec2.Route(route_table_id='rtb-a8b438cf', 105 | # destination_cidr_block='0.0.0.0/0') 106 | if route.destination_cidr_block == dest_cidr: 107 | break 108 | else: 109 | # sometimes get 110 | # AssertionError: Route 
for 0.0.0.0/0 not found in [ec2.Route(route_table_id='rtb-cd9153b0', destination_cidr_block='192.168.0.0/16')] 111 | # TODO: add a wait/retry? 112 | assert False, "Route for %s not found in %s" % (dest_cidr, 113 | route_table.routes) 114 | 115 | assert len(zones) <= 16 # for cidr/20 to fit into cidr/16 116 | ip = 0 117 | for zone in zones: 118 | cidr_block = '192.168.%d.0/20' % (ip,) 119 | ip += 16 120 | print("Creating subnet %s in zone %s" % (cidr_block, zone)) 121 | subnet = vpc.create_subnet(CidrBlock=cidr_block, 122 | AvailabilityZone=zone) 123 | subnet.create_tags(Tags=[{'Key': 'Name', 'Value': f'{vpc_name}-subnet'}, 124 | {'Key': 'Region', 'Value': zone}]) 125 | response = client.modify_subnet_attribute( 126 | MapPublicIpOnLaunch={'Value': True}, 127 | SubnetId=subnet.id 128 | ) 129 | assert u.is_good_response(response) 130 | u.wait_until_available(subnet) 131 | assert subnet.map_public_ip_on_launch, "Subnet doesn't enable public IP by default, why?" 132 | 133 | route_table.associate_with_subnet(SubnetId=subnet.id) 134 | 135 | # Setup security group for non-default VPC 136 | # existing_security_groups = u.get_security_group_dict() 137 | # security_group_nd_name = u.get_security_group_nd_name() 138 | # if security_group_nd_name in existing_security_groups: 139 | # print("Reusing non-default security group " + security_group_nd_name) 140 | # security_group_nd = existing_security_groups[security_group_nd_name] 141 | # assert security_group_nd.vpc_id == vpc.id, f"Found non-default security group {security_group_nd} " \ 142 | # f"attached to {security_group_nd.vpc_id} but expected {vpc.id}" 143 | # else: 144 | # security_group_nd = create_security_group(security_group_nd_name, vpc.id) 145 | 146 | # Setup things on default VPC for zone-agnostic launching 147 | vpc = u.get_default_vpc() 148 | if not vpc: 149 | util.log(f"Creating default VPC for region {u.get_region()}") 150 | client.create_default_vpc() 151 | vpc = u.get_default_vpc() 152 | assert vpc, "Could not create default VPC?" 
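  # (create_default_vpc() above is only expected to succeed when the region has
  #  no default VPC, e.g. it was deleted earlier; that is the only case in which
  #  this branch is reached)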
153 | 154 | existing_security_groups = u.get_security_group_dict() 155 | security_group_name = u.get_security_group_name() 156 | if security_group_name in existing_security_groups: 157 | print("Reusing security group " + security_group_name) 158 | security_group = existing_security_groups[security_group_name] 159 | assert security_group.vpc_id == vpc.id, f"Found security group {security_group} " \ 160 | f"attached to {security_group.vpc_id} but expected {vpc.id}" 161 | else: 162 | security_group = create_security_group(security_group_name, vpc.id) 163 | # Uncomment the following when setting up two VPC's 164 | # security_group = create_security_group(security_group_name, vpc.id, security_group_nd) 165 | 166 | return vpc, security_group 167 | 168 | 169 | def keypair_setup(): 170 | """Creates keypair if necessary, saves private key locally, returns contents 171 | of private key file.""" 172 | 173 | os.system('mkdir -p ' + u.PRIVATE_KEY_LOCATION) 174 | keypair_name = u.get_keypair_name() 175 | keypair = u.get_keypair_dict().get(keypair_name, None) 176 | keypair_fn = u.get_keypair_fn() 177 | if keypair: 178 | print("Reusing keypair " + keypair_name) 179 | # check that local pem file exists and is readable 180 | assert os.path.exists( 181 | keypair_fn), "Keypair %s exists, but corresponding .pem file %s is not found, delete keypair %s through console and run again to recreate keypair/.pem together" % ( 182 | keypair_name, keypair_fn, keypair_name) 183 | keypair_contents = open(keypair_fn).read() 184 | assert len(keypair_contents) > 0 185 | else: 186 | print("Creating keypair " + keypair_name) 187 | ec2 = u.get_ec2_resource() 188 | assert not os.path.exists( 189 | keypair_fn), "previous keypair exists, delete it with 'sudo rm %s' and also delete corresponding keypair through console" % ( 190 | keypair_fn) 191 | keypair = ec2.create_key_pair(KeyName=keypair_name) 192 | 193 | open(keypair_fn, 'w').write(keypair.key_material) 194 | os.system('chmod 400 ' + keypair_fn) 195 | 196 | return keypair 197 | 198 | 199 | def placement_group_setup(group_name): 200 | """Creates placement_group group if necessary. Returns True if new placement_group 201 | group was created, False otherwise.""" 202 | 203 | existing_placement_groups = u.get_placement_group_dict() 204 | 205 | group = existing_placement_groups.get(group_name, None) 206 | if group: 207 | assert group.state == 'available' 208 | assert group.strategy == 'cluster' 209 | print("Reusing group ", group.name) 210 | return group 211 | 212 | print("Creating group " + group_name) 213 | ec2 = u.get_ec2_resource() 214 | group = ec2.create_placement_group(GroupName=group_name, Strategy='cluster') 215 | return group 216 | 217 | 218 | def create_security_group(security_group_name: str, vpc_id: str, other_group: Optional[SecurityGroup] = None): 219 | """Creates security group with proper ports open. 
Optionally allows all traffic from other_group""" 220 | print("Creating security group " + security_group_name) 221 | ec2 = u.get_ec2_resource() 222 | 223 | security_group: SecurityGroup = ec2.create_security_group( 224 | GroupName=security_group_name, Description=security_group_name, 225 | VpcId=vpc_id) 226 | 227 | security_group.create_tags(Tags=u.create_name_tags(security_group_name)) 228 | 229 | # allow ICMP access for public ping 230 | security_group.authorize_ingress( 231 | CidrIp='0.0.0.0/0', 232 | IpProtocol='icmp', 233 | FromPort=-1, 234 | ToPort=-1 235 | ) 236 | 237 | # open public ports 238 | # always include SSH port which is required for basic functionality 239 | assert 22 in PUBLIC_TCP_RANGES, "Must enable SSH access" 240 | for port in PUBLIC_TCP_RANGES: 241 | if util.is_iterable(port): 242 | assert len(port) == 2 243 | from_port, to_port = port 244 | else: 245 | from_port, to_port = port, port 246 | 247 | response = security_group.authorize_ingress(IpProtocol="tcp", 248 | CidrIp="0.0.0.0/0", 249 | FromPort=from_port, 250 | ToPort=to_port) 251 | assert u.is_good_response(response) 252 | 253 | for port in PUBLIC_UDP_RANGES: 254 | if util.is_iterable(port): 255 | assert len(port) == 2 256 | from_port, to_port = port 257 | else: 258 | from_port, to_port = port, port 259 | 260 | response = security_group.authorize_ingress(IpProtocol="udp", 261 | CidrIp="0.0.0.0/0", 262 | FromPort=from_port, 263 | ToPort=to_port) 264 | assert u.is_good_response(response) 265 | 266 | def authorize_from_group(this_security_group: SecurityGroup, other_security_group: SecurityGroup): 267 | """Helper function to authorize all traffic from other_group. Can be used to authorized within-group traffic as 268 | authorize_from_group(group, group)""" 269 | 270 | # Authorizing ingress doesn't work with security group names in a non-default VPC, 271 | # so must use more complicated syntax: https://github.com/boto/boto3/issues/158 272 | response_ = {} 273 | for protocol in ['icmp']: 274 | try: 275 | rule = {'FromPort': -1, 276 | 'IpProtocol': protocol, 277 | 'IpRanges': [], 278 | 'PrefixListIds': [], 279 | 'ToPort': -1, 280 | 'UserIdGroupPairs': [{'GroupId': other_security_group.id}]} 281 | response_ = this_security_group.authorize_ingress(IpPermissions=[rule]) 282 | except Exception as e: 283 | if response_['Error']['Code'] == 'InvalidPermission.Duplicate': 284 | print("Warning, got " + str(e)) 285 | else: 286 | assert False, "Failed while authorizing icml ingress with " + str(e) 287 | 288 | for protocol in ['tcp', 'udp']: 289 | try: 290 | rule = {'FromPort': 0, 291 | 'IpProtocol': protocol, 292 | 'IpRanges': [], 293 | 'PrefixListIds': [], 294 | 'ToPort': 65535, 295 | 'UserIdGroupPairs': [{'GroupId': other_security_group.id}]} 296 | response_ = this_security_group.authorize_ingress(IpPermissions=[rule]) 297 | except Exception as e: 298 | if response_['Error']['Code'] == 'InvalidPermission.Duplicate': 299 | print("Warning, got " + str(e)) 300 | else: 301 | assert False, "Failed while authorizing tcp/udp ingress with " + str(e) 302 | 303 | # authorize EFA traffic 304 | user_id = u.get_account_number() 305 | response = None 306 | try: 307 | rule = { 308 | "IpProtocol": "-1", 309 | "Ipv6Ranges": [], 310 | "PrefixListIds": [], 311 | 'UserIdGroupPairs': [{'Description': 'efa', 'GroupId': other_security_group.id, 'UserId': user_id}] 312 | } 313 | response_ = this_security_group.authorize_ingress(IpPermissions=[rule]) 314 | assert u.is_good_response(response_), str(response) 315 | 316 | rule = { 317 | "IpProtocol": 
"-1", 318 | "PrefixListIds": [], 319 | 'UserIdGroupPairs': [{'Description': 'efa', 320 | 'GroupId': other_security_group.id, 321 | 'UserId': user_id}] 322 | } 323 | response_ = this_security_group.authorize_egress(IpPermissions=[rule]) 324 | assert u.is_good_response(response_), str(response) 325 | 326 | except Exception as e: 327 | if 'Error' in response_ and 'Code' in response['Error'] and response_['Error']['Code'] == 'InvalidPermission.Duplicate': 328 | print(f"Warning while authorizing ingress from {this_security_group.description} ({this_security_group.id}) to " 329 | f"{other_security_group.description} ({other_security_group.id}) with message '{e}'") 330 | 331 | else: 332 | assert False, (f"Failed while authorizing ingress from {this_security_group.description} ({this_security_group.id}) to " 333 | f"{other_security_group.description} ({other_security_group.id}) with message '{e}' and response '{response}'") 334 | 335 | authorize_from_group(security_group, security_group) 336 | # if using multiple security groups, which is required for the case of default + non-default VPC 337 | # also authorize all traffic between them 338 | if other_group: 339 | authorize_from_group(security_group, other_group) 340 | authorize_from_group(other_group, security_group) 341 | 342 | return security_group 343 | 344 | 345 | def create_resources(): 346 | print(f"Creating {u.get_prefix()} resources in region {u.get_region()}") 347 | 348 | vpc, security_group = network_setup() 349 | keypair_setup() # saves private key locally to keypair_fn 350 | 351 | # create EFS 352 | efss = u.get_efs_dict() 353 | efs_name = u.get_efs_name() 354 | efs_id = efss.get(efs_name, '') 355 | if not efs_id: 356 | print("Creating EFS " + efs_name) 357 | efs_id = u.create_efs(efs_name) 358 | else: 359 | print("Reusing EFS " + efs_name) 360 | 361 | efs_client = u.get_efs_client() 362 | 363 | # create mount target for each subnet in the VPC 364 | 365 | # added retries because efs is not immediately available 366 | max_failures = 10 367 | retry_interval_sec = 1 368 | for subnet in vpc.subnets.all(): 369 | for retry_attempt in range(max_failures): 370 | try: 371 | sys.stdout.write( 372 | "Creating efs mount target for %s ... " % (subnet.availability_zone,)) 373 | sys.stdout.flush() 374 | response = efs_client.create_mount_target(FileSystemId=efs_id, 375 | SubnetId=subnet.id, 376 | SecurityGroups=[ 377 | security_group.id]) 378 | if u.is_good_response(response): 379 | print("success") 380 | break 381 | except Exception as e: 382 | if 'already exists' in str(e): # ignore "already exists" errors 383 | print('already exists') 384 | break 385 | 386 | # Takes couple of seconds for EFS to come online, with 387 | # errors like this: 388 | # Creating efs mount target for us-east-1f ... 
Failed with An error occurred (IncorrectFileSystemLifeCycleState) when calling the CreateMountTarget operation: None, retrying in 1 sec 389 | 390 | print("Got %s, retrying in %s sec" % (str(e), retry_interval_sec)) 391 | time.sleep(retry_interval_sec) 392 | else: 393 | print("Giving up.") 394 | 395 | 396 | if __name__ == '__main__': 397 | create_resources() 398 | -------------------------------------------------------------------------------- /ncluster/ncluster_cloud_wipe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Deletes resources 4 | 5 | import sys 6 | import os 7 | import argparse 8 | 9 | from ncluster import aws_util as u 10 | from ncluster import util 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--kind', type=str, default='all', 14 | help="which resources to delete, all/network/keypair/efs") 15 | parser.add_argument('--force_delete_efs', action='store_true', 16 | help="force deleting main EFS") 17 | args = parser.parse_args() 18 | 19 | EFS_NAME = u.get_prefix() 20 | VPC_NAME = u.get_prefix() 21 | SECURITY_GROUP_NAME = u.get_prefix() 22 | ROUTE_TABLE_NAME = u.get_prefix() 23 | KEYPAIR_NAME = u.get_keypair_name() 24 | 25 | client = u.get_ec2_client() 26 | ec2 = u.get_ec2_resource() 27 | 28 | 29 | def response_type(response): 30 | return 'ok' if u.is_good_response(response) else 'failed' 31 | 32 | 33 | def delete_efs(): 34 | efss = u.get_efs_dict() 35 | efs_id = efss.get(EFS_NAME, '') 36 | efs_client = u.get_efs_client() 37 | if efs_id: 38 | try: 39 | # delete mount targets first 40 | print("About to delete %s (%s)" % (efs_id, EFS_NAME)) 41 | response = efs_client.describe_mount_targets(FileSystemId=efs_id) 42 | assert u.is_good_response(response) 43 | for mount_response in response['MountTargets']: 44 | id_ = mount_response['MountTargetId'] 45 | sys.stdout.write('Deleting mount target %s ... ' % (id_,)) 46 | sys.stdout.flush() 47 | response = efs_client.delete_mount_target(MountTargetId=id_) 48 | print(response_type(response)) 49 | 50 | sys.stdout.write('Deleting EFS %s (%s)... ' % (efs_id, EFS_NAME)) 51 | sys.stdout.flush() 52 | u.delete_efs_by_id(efs_id) 53 | 54 | except Exception as e: 55 | sys.stdout.write(f'failed with {e}\n') 56 | util.log_error(str(e) + '\n') 57 | 58 | 59 | def delete_network(): 60 | if u.get_region() == 'us-east-1': 61 | util.log("(Internal safety switch. Not deleting resources in us-east-1, remove this line if you are really sure") 62 | return 63 | 64 | def delete_vpc(vpc, partial=True): 65 | """Deletes VPC + all resources, if "partial" set to True, only deletes associated security groups 66 | """ 67 | print("Deleting VPC %s (%s) subresources:" % (VPC_NAME, vpc.id)) 68 | 69 | # don't modify default VPC 70 | if not partial: 71 | for subnet in vpc.subnets.all(): 72 | try: 73 | sys.stdout.write("Deleting subnet %s ... " % subnet.id) 74 | sys.stdout.write(response_type(subnet.delete()) + '\n') 75 | except Exception as e: 76 | sys.stdout.write('failed\n') 77 | util.log_error(str(e) + '\n') 78 | 79 | for gateway in vpc.internet_gateways.all(): 80 | sys.stdout.write("Deleting gateway %s ... " % gateway.id) 81 | # note: if instances are using VPC, this fails with 82 | # botocore.exceptions.ClientError: An error occurred (DependencyViolation) when calling the DetachInternetGateway operation: Network vpc-ca4abab3 has some mapped public address(es). Please unmap those public address(es) before detaching the gateway. 83 | 84 | sys.stdout.write('detached ... 
' if u.is_good_response( 85 | gateway.detach_from_vpc(VpcId=vpc.id)) else ' detach_failed ') 86 | sys.stdout.write('deleted ' if u.is_good_response( 87 | gateway.delete()) else ' delete_failed ') 88 | sys.stdout.write('\n') 89 | 90 | def desc(): 91 | return "%s (%s)" % (route_table.id, u.get_name(route_table.tags)) 92 | 93 | for route_table in vpc.route_tables.all(): 94 | sys.stdout.write(f"Deleting route table {desc()} ... ") 95 | try: 96 | sys.stdout.write(response_type(route_table.delete()) + '\n') 97 | except Exception as e: 98 | sys.stdout.write('failed\n') 99 | util.log_error(str(e) + '\n') 100 | 101 | else: 102 | util.log(f"vpc {vpc.id} is a default VPC, only doing partial deletion") 103 | 104 | def desc(): 105 | return "%s (%s, %s)" % ( 106 | security_group.id, u.get_name(security_group.tags), 107 | security_group.group_name) 108 | 109 | ncluster_security_groups = u.get_security_group_names() 110 | for security_group in vpc.security_groups.all(): 111 | # default group is undeletable, skip 112 | if security_group.group_name == 'default': 113 | continue 114 | 115 | # don't delete groups created outside of ncluster framework 116 | if security_group.group_name not in ncluster_security_groups: 117 | continue 118 | 119 | sys.stdout.write( 120 | 'Deleting security group %s ... ' % (desc())) 121 | try: 122 | sys.stdout.write(response_type(security_group.delete()) + '\n') 123 | except Exception as e: 124 | sys.stdout.write('failed\n') 125 | util.log_error(str(e) + '\n') 126 | 127 | if not partial: 128 | sys.stdout.write("Deleting VPC %s ... " % vpc.id) 129 | try: 130 | sys.stdout.write(response_type(vpc.delete()) + '\n') 131 | except Exception as e: 132 | sys.stdout.write('failed\n') 133 | util.log_error(str(e) + '\n') 134 | 135 | existing_vpcs = u.get_vpc_dict() 136 | if VPC_NAME in existing_vpcs: 137 | # delete ncluster VPC 138 | delete_vpc(ec2.Vpc(existing_vpcs[VPC_NAME].id), partial=False) 139 | 140 | # delete ncluster resources on default VPC (partial=True) 141 | delete_vpc(u.get_default_vpc()) 142 | 143 | 144 | def delete_keypair(): 145 | keypairs = u.get_keypair_dict() 146 | keypair = keypairs.get(KEYPAIR_NAME, '') 147 | if keypair: 148 | try: 149 | sys.stdout.write("Deleting keypair %s (%s) ... " % (keypair.key_name, 150 | KEYPAIR_NAME)) 151 | sys.stdout.write(response_type(keypair.delete()) + '\n') 152 | except Exception as e: 153 | sys.stdout.write('failed\n') 154 | util.log_error(str(e) + '\n') 155 | 156 | keypair_fn = u.get_keypair_fn() 157 | if os.path.exists(keypair_fn): 158 | print("Deleting local keypair file %s" % (keypair_fn,)) 159 | os.system('rm -f ' + keypair_fn) 160 | 161 | 162 | def delete_resources(force_delete_efs=False): 163 | region = os.environ['AWS_DEFAULT_REGION'] 164 | 165 | resource = u.get_prefix() 166 | answer = input(f"Deleting resources for account {u.get_account_number()}:{u.get_account_name()}, region {u.get_region()}, sure? 
(y/N) ") 167 | 168 | if util.is_set("NCLUSTER_SKIP_CONFIRMATION"): 169 | print("NCLUSTER_SKIP_CONFIRMATION is set, skipping confirmation") 170 | answer = 'y' 171 | 172 | if not answer.lower() == "y": 173 | print("Didn't get y, doing nothing") 174 | return 175 | 176 | print(f"Deleting {resource} resources in region {region}") 177 | print(f"Make sure {resource} instances are terminated or this will fail.") 178 | 179 | if 'efs' in args.kind or 'all' in args.kind: 180 | if EFS_NAME == u.DEFAULT_PREFIX and not force_delete_efs: 181 | # this is default EFS, likely has stuff, require extra flag to delete it 182 | print("default EFS has useful stuff in it, not deleting it. Use force_delete_efs " 183 | "flag to force. This means security group deletion will fail as well.") 184 | else: 185 | delete_efs() 186 | if 'network' in args.kind or 'all' in args.kind: 187 | delete_network() 188 | if 'keypair' in args.kind or 'all' in args.kind: 189 | delete_keypair() 190 | 191 | 192 | if __name__ == '__main__': 193 | delete_resources(force_delete_efs=args.force_delete_efs) 194 | -------------------------------------------------------------------------------- /ncluster/ncluster_globals.py: -------------------------------------------------------------------------------- 1 | """Module that keeps global state of ncluster tasks, such as naming, 2 | connection of tasks to runs. 3 | 4 | run refers to string name 5 | run_object refers to Run object corresponding to that name 6 | 7 | """ 8 | import os 9 | import sys 10 | from typing import Dict, Any, List 11 | 12 | from . import aws_backend as backend 13 | from . import util 14 | 15 | task_launched = False # keep track whether anything has been launched 16 | 17 | task_counter = 0 18 | job_counter = 0 19 | run_counter = 0 20 | 21 | run_dict: Dict[str, Any] = {} 22 | task_run_dict: Dict["backend.Task", str] = {} 23 | run_task_dict: Dict[str, List["backend.Task"]] = {} 24 | run_logdir_dict: Dict[str, str] = {} 25 | 26 | tasks_seen: List["backend.Task"] = [] # list of all tasks created 27 | 28 | enforce_placement_group_val = False 29 | 30 | 31 | def enforce_placement_group(): 32 | """Enforces all tasks to be launched into placement group.""" 33 | global enforce_placement_group_val 34 | enforce_placement_group_val = True 35 | 36 | 37 | def unenforce_placement_group(): 38 | """Enforces all tasks to be launched into placement group.""" 39 | global enforce_placement_group_val 40 | enforce_placement_group_val = False 41 | 42 | 43 | def is_enforced_placement_group(): 44 | return enforce_placement_group_val 45 | 46 | 47 | def auto_assign_task_name_if_needed(name, instance_type='', image_name='', 48 | tasks=1): 49 | global task_counter 50 | if name: 51 | return name 52 | 53 | main_script = os.path.abspath(sys.argv[0]) 54 | script_id = util.alphanumeric_hash( 55 | f"{main_script}-{instance_type}-{image_name}-{tasks}") 56 | name = f"unnamedtask-{task_counter}-{script_id}" 57 | task_counter += 1 58 | return name 59 | 60 | 61 | def auto_assign_job_name_if_needed(name): 62 | global job_counter 63 | if name: 64 | return name 65 | script_id = util.alphanumeric_hash(sys.argv[0]) 66 | name = f"unnamedjob-{job_counter}-{script_id}" 67 | job_counter += 1 68 | return name 69 | 70 | 71 | def auto_assign_run_name_if_needed(name): 72 | global run_counter 73 | if name: 74 | return name 75 | script_id = util.alphanumeric_hash(sys.argv[0]) 76 | name = f"unnamedrun-{run_counter}-{script_id}" 77 | run_counter += 1 78 | return name 79 | 80 | 81 | def register_task(task: Any, run_name: str): 82 | 
global task_run_dict, run_task_dict, tasks_seen 83 | assert task.name not in tasks_seen 84 | tasks_seen.append(task.name) 85 | task_run_dict[task] = run_name 86 | run_task_list = run_task_dict.get(run_name, []) 87 | run_task_list.append(task) 88 | 89 | # disable check because it's useless (instance creation fails with missing placement group before getting to register_task) 90 | # enforce uniformity -- either all tasks in a run are reused (assuming 1 job per run) or all tasks are created fresh 91 | # has_reuse = sum(task.instance_reuse for task in run_task_list) 92 | # has_fresh = sum(not task.instance_reuse for task in run_task_list) 93 | # if has_reuse + has_fresh != 1: 94 | # tasks_to_kill = [task.name for task in run_task_list] 95 | # print(f"Fatal: trying to reuse some instances while recreating others. Launching a group requires launching all " 96 | # f"instances together. Kill following instances and try again: {','.join(tasks_to_kill)}") 97 | # for task in run_task_list: 98 | # print(f"{task.name}: {'reused' if task.instance_reuse else 'fresh'}") 99 | # os.kill(os.getpid(), signal.SIGTERM) # sys.exit() doesn't work inside thread 100 | 101 | 102 | def register_run(run: "backend.Run", run_name: str) -> None: 103 | print(f"Registering run {run_name}") 104 | assert run_name not in run_dict 105 | assert run_name # empty name reserved to mean no run 106 | run_dict[run_name] = run 107 | 108 | 109 | def is_chief(task: "backend.Task", run_name: str): 110 | """Returns True if task is chief task in the corresponding run""" 111 | global run_task_dict 112 | if run_name not in run_task_dict: 113 | return True 114 | task_list = run_task_dict[run_name] 115 | assert task in task_list, f"Task {task.name} doesn't belong to run {run_name}" 116 | return task_list[0] == task 117 | 118 | 119 | def get_chief(run_name: str): 120 | assert run_name in run_task_dict, f"Run {run_name} doesn't exist" 121 | tasks = run_task_dict[run_name] 122 | assert tasks, f"Run {run_name} had tasks {tasks}, expected non-empty list" 123 | return tasks[0] 124 | 125 | 126 | def get_logdir(run_name: str): 127 | """Returns logdir for this run. 
It is the job of logdir creator to set logdir for this run""" 128 | 129 | if not run_name: 130 | return '/tmp' 131 | return run_logdir_dict.get(run_name, '') 132 | 133 | 134 | def set_logdir(run_name, logdir): 135 | assert run_name not in run_logdir_dict, f"logdir for run {run_name} has already been set to {run_logdir_dict[run_name]}, trying to change it to {logdir} is illegal" 136 | run_logdir_dict[run_name] = logdir 137 | 138 | 139 | def get_run_for_task(task: "backend.Task") -> str: 140 | """Gets run name associated with given Task""" 141 | return task_run_dict.get(task, '') 142 | 143 | 144 | def get_run_object(run_name: str) -> "backend.Run": 145 | return run_dict.get(run_name, None) 146 | 147 | 148 | def create_run_if_needed(run_name, run_creation_callback) -> "backend.Run": 149 | if run_name in run_dict: 150 | return run_dict[run_name] 151 | run = run_creation_callback(run_name) 152 | return run 153 | 154 | 155 | _should_skip_setup = False 156 | 157 | 158 | def set_should_skip_setup(val): 159 | global _should_skip_setup 160 | if val: 161 | util.log("skipping setup for all subsequent tasks/jobs") 162 | _should_skip_setup = val 163 | 164 | 165 | def should_skip_setup(): 166 | return _should_skip_setup 167 | -------------------------------------------------------------------------------- /ncluster/old_backend.py: -------------------------------------------------------------------------------- 1 | """Interface for job launching backend. 2 | 3 | Run/Job and Task are container classes encapsulating functionality. 4 | User creates them through make_run/make_job/make_task methods 5 | 6 | """ 7 | # Job launcher Python API: https://docs.google.com/document/d/1yTkb4IPJXOUaEWksQPCH7q0sjqHgBf3f70cWzfoFboc/edit 8 | # AWS job launcher (concepts): https://docs.google.com/document/d/1IbVn8_ckfVO3Z9gIiE0b9K3UrBRRiO9HYZvXSkPXGuw/edit 9 | import threading 10 | import time 11 | from typing import List, Tuple, Any, Optional 12 | 13 | from . 
import util 14 | 15 | # aws_backend.py 16 | # local_backend.py 17 | 18 | LOGDIR_ROOT: Optional[str] = None # location of logdir for this backend 19 | 20 | """ 21 | backend = aws_backend # alternatively, backend=tmux_backend to launch jobs locally in separate tmux sessions 22 | run = backend.make_run("helloworld") # sets up /efs/runs/helloworld 23 | worker_job = run.make_job("worker", instance_type="g3.4xlarge", num_tasks=4, ami=ami, setup_script=setup_script) 24 | ps_job = run.make_job("ps", instance_type="c5.xlarge", num_tasks=4, ami=ami, setup_script=setup_script) 25 | setup_tf_config(worker_job, ps_job) 26 | ps_job.run("python cifar10_main.py --num_gpus=0") # runs command on each task 27 | worker_job.run("python cifar10_main.py --num_gpus=4") 28 | 29 | tb_job = run.make_job("tb", instance_type="m4.xlarge", num_tasks=1, public_port=6006) 30 | tb_job.run("tensorboard --logdir=%s --port=%d" %(run.logdir, 6006)) 31 | # when job has one task, job.task[0].ip can be accessed as job.ip 32 | print("See TensorBoard progress on %s:%d" %(tb_job.ip, 6006)) 33 | print("To interact with workers: %s" %(worker_job.connect_instructions)) 34 | 35 | 36 | To reconnect to existing job: 37 | 38 | """ 39 | 40 | 41 | class Task: 42 | name: str 43 | ip: Optional[str] 44 | public_ip: Optional[str] 45 | run_counter: int 46 | # location where temporary files from interfacing with task go locally 47 | local_scratch: Optional[str] 48 | # location where temporary files from interfacing with task go on task 49 | remote_scratch: Optional[str] 50 | job: Any # can't declare Job because of circular dependency 51 | 52 | def __init__(self, name=''): 53 | """Wraps execution resources into a task. Runs install_script if present""" 54 | self.last_status = None 55 | self.name = name 56 | self.instance = None 57 | self.install_script = None 58 | self.job = None 59 | self.kwargs = None 60 | self.public_ip = None 61 | self.ip = None 62 | self.logdir_ = None 63 | 64 | @property 65 | def logdir(self): 66 | raise NotImplementedError() 67 | 68 | def run(self, cmd: str, non_blocking=False, ignore_errors=False): 69 | """Runs command on given task.""" 70 | raise NotImplementedError() 71 | 72 | # TODO: reuse regular run 73 | def run_with_output(self, cmd, non_blocking=False, ignore_errors=False) -> \ 74 | Tuple[str, str]: 75 | """ 76 | 77 | Args: 78 | cmd: single line shell command to run 79 | non_blocking (bool): if True, does not wait for command to finish 80 | ignore_errors: if True, will succeed even if command failed 81 | 82 | Returns: 83 | Contents of stdout/stderr as strings. 
84 | Raises 85 | RuntimeException: if command produced non-0 returncode 86 | 87 | """ 88 | 89 | assert '\n' not in cmd, "Do not support multi-line commands" 90 | cmd: str = cmd.strip() 91 | if not cmd or cmd.startswith('#'): # ignore empty/commented out lines 92 | return '', '' 93 | 94 | stdout_fn = f"{self.remote_scratch}/{self.run_counter+1}.stdout" 95 | stderr_fn = f"{self.remote_scratch}/{self.run_counter+1}.stderr" 96 | cmd2 = f"{cmd} > {stdout_fn} 2> {stderr_fn}" 97 | 98 | assert not non_blocking, "Getting output doesn't work with non_blocking" 99 | status = self.run(cmd2, False, ignore_errors=True) 100 | stdout = self.read(stdout_fn) 101 | stderr = self.read(stderr_fn) 102 | 103 | if self.last_status > 0: 104 | self.log(f"Warning: command '{cmd}' returned {status}," 105 | f" stdout was '{stdout}' stderr was '{stderr}'") 106 | if not ignore_errors: 107 | raise RuntimeError(f"Warning: command '{cmd}' returned {status}," 108 | f" stdout was '{stdout}' stderr was '{stderr}'") 109 | 110 | return stdout, stderr 111 | 112 | def wait_for_file(self, fn: str, max_wait_sec: int = 3600 * 24 * 365, 113 | check_interval: float = 0.02) -> bool: 114 | """ 115 | Waits for file maximum of max_wait_sec. Returns True if file was detected within specified max_wait_sec 116 | Args: 117 | fn: filename on task machine 118 | max_wait_sec: how long to wait in seconds 119 | check_interval: how often to check in seconds 120 | Returns: 121 | False if waiting was was cut short by max_wait_sec limit, True otherwise 122 | """ 123 | # print("Waiting for file", fn) 124 | start_time = time.time() 125 | while True: 126 | if time.time() - start_time > max_wait_sec: 127 | util.log(f"Timeout exceeded ({max_wait_sec} sec) for {fn}") 128 | return False 129 | if not self.exists(fn): 130 | time.sleep(check_interval) 131 | continue 132 | else: 133 | break 134 | return True 135 | 136 | def _run_raw(self, cmd): 137 | """Runs command directly on every task in the job, skipping tmux interface. Use if want to create/manage additional tmux sessions manually.""" 138 | raise NotImplementedError() 139 | 140 | def upload(self, local_fn: str, remote_fn: str = '', 141 | dont_overwrite: bool = False): 142 | """Uploads given file to the task. If remote_fn is not specified, dumps it 143 | into task current directory with the same name. 
144 | 145 | Args: 146 | local_fn: location of file locally 147 | remote_fn: location of file on task 148 | dont_overwrite: if True, will be no-op if target file exists 149 | """ 150 | raise NotImplementedError() 151 | 152 | def download(self, remote_fn: str, local_fn: str = ''): 153 | """Downloads remote file to current directory.""" 154 | raise NotImplementedError() 155 | 156 | def write(self, fn, contents): 157 | """Write string contents to file fn in task.""" 158 | raise NotImplementedError() 159 | 160 | def read(self, fn): 161 | """Read contents of file and return it as string.""" 162 | raise NotImplementedError() 163 | 164 | def exists(self, fn) -> bool: 165 | """Checks if fn exists on task 166 | 167 | Args: 168 | fn: filename local to task 169 | Returns: 170 | true if fn exists on task machine 171 | """ 172 | raise NotImplementedError() 173 | 174 | def log(self, message, *args): 175 | """Log to launcher console.""" 176 | if args: 177 | message %= args 178 | 179 | print(f"{util.current_timestamp()} {self.name}: {message}") 180 | 181 | 182 | class Job: 183 | name: str 184 | tasks: List[Task] 185 | 186 | # run_: Run 187 | 188 | def __init__(self, name: str, tasks: List[Task] = None, **kwargs): 189 | """Initializes Job object, links tasks to refer back to the Job.""" 190 | if tasks is None: 191 | tasks = [] 192 | self.name = name 193 | self.tasks = tasks 194 | self.kwargs = kwargs 195 | # TODO: maybe backlinking is not needed 196 | for task in tasks: 197 | task.job = self 198 | 199 | @property 200 | def logdir(self): 201 | return self.tasks[0].logdir 202 | 203 | def _task_parallel(self, method, *args, **kwargs): 204 | """Runs given method on every task in the job in parallel. Blocks until all tasks finish. Propagates exception from first 205 | failed task.""" 206 | 207 | exceptions = [] 208 | 209 | def task_run(task): 210 | try: 211 | getattr(task, method)(*args, **kwargs) 212 | except Exception as e: 213 | exceptions.append(e) 214 | 215 | threads = [threading.Thread(name=f'task_{method}_{i}', 216 | target=task_run, args=[t]) 217 | for i, t in enumerate(self.tasks)] 218 | for thread in threads: 219 | thread.start() 220 | for thread in threads: 221 | thread.join() 222 | if exceptions: 223 | raise exceptions[0] 224 | 225 | def run(self, *args, **kwargs): 226 | """Runs command on every task in the job in parallel, blocks until all tasks finish. 227 | See Task for documentation of args/kwargs.""" 228 | return self._task_parallel("run", *args, **kwargs) 229 | 230 | def propagate_env(self, *args, **kwargs): 231 | """See py:func:`aws_backend.Task.propagate_env`""" 232 | return self._task_parallel("propagate_env", *args, **kwargs) 233 | 234 | def run_with_output(self, *args, **kwargs): 235 | """Runs command on every task in the job in parallel, blocks until all tasks finish. 
236 | See Task for documentation of args/kwargs.""" 237 | return self._task_parallel("run_with_output", *args, **kwargs) 238 | 239 | def rsync(self, *args, **kwargs): 240 | """See :py:func:`backend.Task.rsync`""" 241 | return self._task_parallel("rsync", *args, **kwargs) 242 | 243 | def upload(self, *args, **kwargs): 244 | """See :py:func:`backend.Task.upload`""" 245 | return self._task_parallel("upload", *args, **kwargs) 246 | 247 | def write(self, *args, **kwargs): 248 | return self._task_parallel("write", *args, **kwargs) 249 | 250 | def _run_raw(self, *args, **kwargs): 251 | return self._task_parallel("_run_raw", *args, **kwargs) 252 | 253 | 254 | # Implementation needs to be backend specific so that run.create_job calls backend-specific method 255 | class Run: 256 | """Run is a collection of jobs that share state. IE, training run will contain gradient worker job, parameter 257 | server job, and TensorBoard visualizer job. These jobs will use the same shared directory to store checkpoints and 258 | event files. 259 | :ivar aws_placement_group_name: somedoc 260 | """ 261 | jobs: List[Job] 262 | 263 | @property 264 | def logdir(self): 265 | raise NotImplementedError() 266 | 267 | # TODO: currently this is synchronous, use non_blocking wrapper like in Job to parallelize methods 268 | def run(self, *args, **kwargs): 269 | raise NotImplementedError() 270 | 271 | def run_with_output(self, *args, **kwargs): 272 | raise NotImplementedError() 273 | 274 | def _run_raw(self, *args, **kwargs): 275 | raise NotImplementedError() 276 | 277 | def upload(self, *args, **kwargs): 278 | raise NotImplementedError() 279 | 280 | def make_job(self, name='', **kwargs): 281 | raise NotImplementedError() 282 | 283 | 284 | def make_task(**_kwargs): 285 | raise NotImplementedError() 286 | 287 | 288 | def make_job(**_kwargs): 289 | raise NotImplementedError() 290 | 291 | 292 | def make_run(**_kwargs): 293 | raise NotImplementedError() 294 | -------------------------------------------------------------------------------- /ncluster/summary.txt: -------------------------------------------------------------------------------- 1 | tf_two_machines -- 500 on t3, 910 on c3 2 | 3 | -------------------------------------------------------------------------------- /ncluster/test.py: -------------------------------------------------------------------------------- 1 | 2 | print("%20s" % ('asdfasdf',)) 3 | print(f"{'asdfasdf':>20}") 4 | 5 | print("%5.2f" % (5.5,)) 6 | print(f"{5.5:5.2f}") 7 | -------------------------------------------------------------------------------- /ncluster/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various helper utilities used internally by ncluster project, that are not explicitly tied to AWS 3 | """ 4 | 5 | import os 6 | import random 7 | import string 8 | import sys 9 | import time 10 | from collections import Iterable 11 | import shlex 12 | 13 | from typing import Optional, Tuple 14 | 15 | import portalocker 16 | import paramiko 17 | 18 | # starting value for now_micros (Aug 31, 2018), using this to make various timestamped names shorter 19 | EPOCH_MICROS = 1535753974788163 20 | 21 | 22 | # Whitelist of temporary settings customizable through env vars. These are work-arounds for issues that are 23 | # don't have permanent solutions yet. Keep this small to avoid many moving parts. 
24 | env_settings = { 25 | 'NCLUSTER_AUTHORIZED_KEYS', # public keys used to authorize ssh access on all instances 26 | 'NCLUSTER_AWS_FAST_ROOTDISK', # request $1/hour high performance AWS disk 27 | 'NCLUSTER_AWS_PLACEMENT_GROUP', # name of placement group to use, use when adding machines to a previous launched job 28 | 'NCLUSTER_DISABLE_PDB_HANDLER', # don't intercept pdb exception by default 29 | 'NCLUSTER_RUNNING_UNDER_CIRCLECI', # special settings for non-interactive CircleCI integration test env 30 | # 'NCLUSTER_IMAGE', 31 | 'NCLUSTER_SSH_USERNAME', # used as workaround when Amazon Linux detection fails 32 | 'NCLUSTER_ZONE', # zone spec for when automatic zone fails (p3dn's + spot instances) 33 | 'NCLUSTER_AWS_FORCE_CREATE_RESOURCES', # AWS resources are created ignoring automatic existence checks 34 | } 35 | 36 | 37 | # keep this here instead of aws_backend because it's used by its dependency aws_util 38 | VALID_REGIONS = ['us-east-2', 39 | 'us-east-1', 40 | 'us-west-1', # An error occurred (Unsupported) when calling the RunInstances operation 41 | 'us-west-2', 42 | 'ap-east-1', # doesn't have ec2 43 | 'ap-south-1', # no EFS 44 | 'ap-northeast-3', # An error occurred (OptInRequired) when calling the DescribeVpcs operation 45 | 'ap-northeast-2', 46 | 'ap-southeast-1', 47 | 'ap-southeast-2', 48 | 'ap-northeast-1', 49 | 'ca-central-1', 50 | 'cn-north-1', # account number 51 | 'cn-northwest-1', # account number 52 | 'eu-central-1', 53 | 'eu-west-1', 54 | 'eu-west-2', 55 | 'eu-west-3', # no EFS 56 | 'eu-north-1', # no EFS 57 | 'sa-east-1', # no EFS 58 | 'us-gov-east-1', # not authorized 59 | 'us-gov-west-1', # not authorized 60 | ] 61 | 62 | # print/validate custom settings 63 | for v in os.environ: 64 | if v.startswith('NCLUSTER'): 65 | assert v in env_settings, f"Custom setting '{v}'='{os.environ[v]}' not in settings whitelist, if you" \ 66 | f"are sure you need this setting, add it to the env_settings in {os.path.basename(__file__)}, otherwise 'unset {v}'" 67 | # the following are often set by default, so don't print them 68 | if v in {'NCLUSTER_AUTHORIZED_KEYS', 'NCLUSTER_ZONE'}: 69 | continue 70 | 71 | sys.stderr.write(f"ncluster env setting {v}={os.environ[v]}\n") 72 | 73 | 74 | def is_iterable(k): 75 | return isinstance(k, Iterable) 76 | 77 | 78 | def now_micros(absolute=False) -> int: 79 | """Return current micros since epoch as integer.""" 80 | micros = int(time.time() * 1e6) 81 | if absolute: 82 | return micros 83 | return micros - EPOCH_MICROS 84 | 85 | 86 | def now_millis(absolute=False) -> int: 87 | """Return current millis since epoch as integer.""" 88 | millis = int(time.time() * 1e3) 89 | if absolute: 90 | return millis 91 | return millis - EPOCH_MICROS // 1000 92 | 93 | 94 | def current_timestamp() -> str: 95 | # timestamp format from https://github.com/tensorflow/tensorflow/blob/155b45698a40a12d4fef4701275ecce07c3bb01a/tensorflow/core/platform/default/logging.cc#L80 96 | current_seconds = time.time() 97 | remainder_micros = int(1e6 * (current_seconds - int(current_seconds))) 98 | time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(current_seconds)) 99 | full_time_str = "%s.%06d" % (time_str, remainder_micros) 100 | return full_time_str 101 | 102 | 103 | def log_error(*args, **kwargs): 104 | print(f"Error encountered {args} {kwargs}") 105 | 106 | 107 | def log(*args, **kwargs): 108 | print(f"{args} {kwargs}") 109 | 110 | 111 | def install_pdb_handler(): 112 | """Automatically start pdb: 113 | 1. CTRL+\\ breaks into pdb. 114 | 2. pdb gets launched on exception. 
115 | """ 116 | 117 | import signal 118 | import pdb 119 | 120 | def handler(_signum, _frame): 121 | pdb.set_trace() 122 | signal.signal(signal.SIGQUIT, handler) 123 | 124 | # Drop into PDB on exception 125 | # from https://stackoverflow.com/questions/13174412 126 | def info(type_, value, tb): 127 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 128 | # we are in interactive mode or we don't have a tty-like 129 | # device, so we call the default hook 130 | sys.__excepthook__(type_, value, tb) 131 | else: 132 | import traceback 133 | import pdb 134 | # we are NOT in interactive mode, print the exception... 135 | traceback.print_exception(type_, value, tb) 136 | print() 137 | # ...then start the debugger in post-mortem mode. 138 | pdb.pm() 139 | 140 | sys.excepthook = info 141 | 142 | 143 | def shell_add_echo(script): 144 | """Goes over each line script, adds "echo cmd" in front of each cmd. 145 | 146 | ls a 147 | 148 | becomes 149 | 150 | echo * ls a 151 | ls a 152 | """ 153 | new_script = "" 154 | for cmd in script.split('\n'): 155 | cmd = cmd.strip() 156 | if not cmd: 157 | continue 158 | new_script += "echo \\* " + shlex.quote(cmd) + "\n" 159 | new_script += cmd + "\n" 160 | return new_script 161 | 162 | 163 | def shell_strip_comment(cmd): 164 | """ hi # testing => hi""" 165 | if '#' in cmd: 166 | return cmd.split('#', 1)[0] 167 | else: 168 | return cmd 169 | 170 | 171 | def random_id(k=5): 172 | """Random id to use for AWS identifiers.""" 173 | # https://stackoverflow.com/questions/2257441/random-string-generation-with-upper-case-letters-and-digits-in-python 174 | return ''.join(random.choices(string.ascii_lowercase + string.digits, k=k)) 175 | 176 | 177 | def alphanumeric_hash(s: str, size=5): 178 | """Short alphanumeric string derived from hash of given string""" 179 | import hashlib 180 | import base64 181 | hash_object = hashlib.md5(s.encode('ascii')) 182 | s = base64.b32encode(hash_object.digest()) 183 | result = s[:size].decode('ascii').lower() 184 | return result 185 | 186 | 187 | def reverse_taskname(name: str) -> str: 188 | """ 189 | Reverses components in the name of task. 
Reversed convention is used for filenames since 190 | it groups log/scratch files of related tasks together 191 | 192 | 0.somejob.somerun -> somerun.somejob.0 193 | 0.somejob -> somejob.0 194 | somename -> somename 195 | 196 | Args: 197 | name: name of task 198 | 199 | """ 200 | components = name.split('.') 201 | assert len(components) <= 3 202 | return '.'.join(components[::-1]) 203 | 204 | 205 | def is_bash_builtin(cmd): 206 | """Return true if command is invoking bash built-in 207 | """ 208 | # from compgen -b 209 | bash_builtins = ['alias', 'bg', 'bind', 'alias', 'bg', 'bind', 'break', 210 | 'builtin', 'caller', 'cd', 'command', 'compgen', 'complete', 211 | 'compopt', 'continue', 'declare', 'dirs', 'disown', 'echo', 212 | 'enable', 'eval', 'exec', 'exit', 'export', 'false', 'fc', 213 | 'fg', 'getopts', 'hash', 'help', 'history', 'jobs', 'kill', 214 | 'let', 'local', 'logout', 'mapfile', 'popd', 'printf', 215 | 'pushd', 'pwd', 'read', 'readarray', 'readonly', 'return', 216 | 'set', 'shift', 'shopt', 'source', 'suspend', 'test', 217 | 'times', 'trap', 'true', 'type', 'typeset', 'ulimit', 218 | 'umask', 'unalias', 'unset', 'wait'] 219 | toks = cmd.split() 220 | if toks and toks[0] in bash_builtins: 221 | return True 222 | return False 223 | 224 | 225 | def is_set(name: str) -> bool: 226 | """Helper method to check if given property is set""" 227 | assert name in env_settings 228 | 229 | val = os.environ.get(name, '0') 230 | return not (val == '0') 231 | 232 | 233 | def get_env(name: str) -> Optional[str]: 234 | """Helper method to retrieve custom env setting, returns None if not set""" 235 | assert name in env_settings 236 | return os.environ.get(name, None) 237 | 238 | 239 | def set_env(name: str, value: str) -> None: 240 | """Helper method to set custom env setting""" 241 | assert name in env_settings 242 | os.environ[name] = value 243 | 244 | 245 | def assert_script_in_current_directory(): 246 | """Assert fail if current directory is different from location of the script""" 247 | 248 | script = sys.argv[0] 249 | assert os.path.abspath(os.path.dirname(script)) == os.path.abspath( 250 | '.'), f"Change into directory of script {script} and run again." 251 | 252 | 253 | def validate_ncluster_job_name(name): 254 | assert name.count( 255 | '.') <= 1, "Job name has too many .'s (see ncluster design: Run/Job/Task hierarchy for convention)" 256 | 257 | 258 | def toseconds(dt) -> float: 259 | """Converts datetime object to seconds.""" 260 | return time.mktime(dt.utctimetuple()) 261 | 262 | 263 | def wait_for_file(fn: str, max_wait_sec: int = 60, 264 | check_interval: float = 1) -> bool: 265 | """ 266 | Waits for file maximum of max_wait_sec. 
Returns True if file was detected within specified max_wait_sec 267 | Args: 268 | fn: filename 269 | max_wait_sec: how long to wait in seconds 270 | check_interval: how often to check in seconds 271 | Returns: 272 | False if waiting was was cut short by max_wait_sec limit, True otherwise 273 | """ 274 | log("Waiting for file", fn) 275 | start_time = time.time() 276 | while True: 277 | if time.time() - start_time > max_wait_sec: 278 | log(f"Timeout exceeded ({max_wait_sec} sec) for {fn}") 279 | return False 280 | if not os.path.exists(fn): 281 | time.sleep(check_interval) 282 | continue 283 | else: 284 | break 285 | return True 286 | 287 | 288 | # locations of default keypair 289 | ID_RSA = os.environ['HOME'] + '/.ssh/id_rsa' 290 | ID_RSA_PUB = ID_RSA + '.pub' 291 | 292 | 293 | def setup_local_ssh_keys() -> str: 294 | """Sanity checks on local ssh keypair and regenerate it if necessary. Returns location of public keypair file""" 295 | 296 | if os.path.exists(ID_RSA_PUB): 297 | assert os.path.exists(ID_RSA), f"Public key {ID_RSA_PUB} exists but private key {ID_RSA} not found, delete {ID_RSA_PUB} and run again to regenerate pair" 298 | log(f"Found local keypair {ID_RSA}") 299 | elif os.path.exists(ID_RSA): 300 | if not os.path.exists(ID_RSA_PUB): 301 | if is_set('NCLUSTER_RUNNING_UNDER_CIRCLECI'): 302 | pass 303 | else: 304 | assert os.path.exists(ID_RSA_PUB), f"Private key {ID_RSA} exists but public key {ID_RSA_PUB} not found, delete {ID_RSA} and run again to regenerate pair" 305 | log(f"Found local keypair {ID_RSA}") 306 | else: 307 | log(f"Generating keypair {ID_RSA}") 308 | with portalocker.Lock(ID_RSA+'.lock', timeout=5) as _: 309 | os.system(f"ssh-keygen -t rsa -f {ID_RSA} -N ''") 310 | os.system(f'rm {ID_RSA}.lock') 311 | 312 | return ID_RSA_PUB 313 | 314 | 315 | def get_authorized_keys() -> str: 316 | """Appends local public key to NCLUSTER_AUTHORIZED_KEYS and returns in format key1;key2;key3; 317 | The result can be assigned back to NCLUSTER_AUTHORIZED_KEYS env var""" 318 | 319 | assert os.path.exists(ID_RSA_PUB), f"{ID_RSA_PUB} not found, make sure to run 'ncluster keys'" 320 | 321 | current_key = open(ID_RSA_PUB).read().strip() 322 | auth_keys = os.environ.get('NCLUSTER_AUTHORIZED_KEYS', '') 323 | return auth_keys+';'+current_key+';' 324 | 325 | 326 | def get_public_key() -> str: 327 | """Returns public key, creating it if needed""" 328 | 329 | if not os.path.exists(ID_RSA_PUB): 330 | print(f"{ID_RSA_PUB} not found, running sure to run setup_local_ssh_keys()") 331 | setup_local_ssh_keys() 332 | assert os.path.exists(ID_RSA_PUB) 333 | 334 | return open(ID_RSA_PUB).read().strip() 335 | 336 | 337 | def exec_command(ssh: paramiko.SSHClient, command: str, bufsize=-1, timeout=None, get_pty=False, environment=None) -> Tuple[paramiko.ChannelFile, paramiko.ChannelFile, paramiko.ChannelFile, paramiko.Channel]: 338 | """Copy of paramiko's exec_command which also returns the channel.""" 339 | 340 | transport: paramiko.Transport = ssh.get_transport() 341 | chan: paramiko.Channel = transport.open_session(timeout=timeout) 342 | if get_pty: 343 | chan.get_pty() 344 | chan.settimeout(timeout) 345 | if environment: 346 | chan.update_environment(environment) 347 | chan.exec_command(command) 348 | stdin: paramiko.ChannelFile = chan.makefile("wb", bufsize) 349 | stdout: paramiko.ChannelFile = chan.makefile("r", bufsize) 350 | stderr: paramiko.ChannelFile = chan.makefile_stderr("r", bufsize) 351 | return stdin, stdout, stderr, chan 352 | 353 | 354 | class timeit: 355 | """Decorator to measure length of 
time spent in the block in millis and log 356 | it to TensorBoard.""" 357 | 358 | def __init__(self, tag=""): 359 | self.tag = tag 360 | 361 | def __enter__(self): 362 | self.start = time.perf_counter() 363 | return self 364 | 365 | def __exit__(self, *args): 366 | self.end = time.perf_counter() 367 | interval_ms = 1000 * (self.end - self.start) 368 | print(f'timeit({self.tag}): {interval_ms})') 369 | 370 | 371 | # no_op method/object that accept every signature 372 | class NoOp: 373 | def __getattr__(self, *args): 374 | def no_op(*_args, **_kwargs): pass 375 | return no_op 376 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | botocore 2 | boto3>=1.9.159 # needed for InterfaceType in create_instances 3 | boto3_type_annotations # for dev use boto3_type_annotations_with_docs 4 | # cryptography==2.4.2 # workaround for https://github.com/paramiko/paramiko/issues/1369 5 | cryptography 6 | paramiko 7 | portalocker 8 | portpicker 9 | pytz 10 | wandb -------------------------------------------------------------------------------- /requirements_benchmarks.txt: -------------------------------------------------------------------------------- 1 | tensorflow 2 | numpy 3 | torch 4 | ray 5 | -------------------------------------------------------------------------------- /requirements_test.txt: -------------------------------------------------------------------------------- 1 | wrapt 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = ncluster 3 | author = Yaroslav Bulatov, Andrew Shaw, Ben Mann 4 | author_email = yaroslavvb@gmail.com 5 | description= Lightweight interface to launching jobs in the cloud 6 | long_description = file: README.md 7 | long_description_content_type = text/markdown 8 | license_file = LICENSE 9 | url = https://github.com/yaroslavvb/ncluster 10 | classifiers = 11 | Programming Language :: Python :: 3 12 | License :: OSI Approved :: MIT License 13 | Operating System :: OS Independent 14 | 15 | [options] 16 | python_requires = >= 3.6 17 | setup_requires = 18 | setuptools >= 38.6 19 | pip >= 10 20 | twine >= 1.11 21 | packages = find: 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import re 3 | 4 | requirements = [] 5 | for line in open('requirements.txt'): 6 | req = line.split('#', 1)[0] # strip comments 7 | requirements.append(req.strip()) 8 | 9 | # follow https://stackoverflow.com/a/7071358/419116 10 | VERSIONFILE = "ncluster/_version.py" 11 | verstrline = open(VERSIONFILE, "rt").read() 12 | VSRE = r"^__version__ = ['\"]([^'\"]*)['\"]" 13 | mo = re.search(VSRE, verstrline, re.M) 14 | if mo: 15 | verstr = mo.group(1) 16 | else: 17 | raise RuntimeError("Unable to find version string in %s." 
% (VERSIONFILE,)) 18 | 19 | setup(scripts=['ncluster/ncluster_cloud_setup.py', # also used as module 20 | 'ncluster/ncluster_cloud_wipe.py', 21 | 'tools/nsync', 22 | 'tools/ncluster'], 23 | install_requires=requirements, 24 | version=verstr, 25 | ) 26 | -------------------------------------------------------------------------------- /tests/integration_test.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import random 5 | import string 6 | import sys 7 | 8 | import wandb 9 | 10 | # in test environments disable pdb intercept 11 | os.environ['NCLUSTER_DISABLE_PDB_HANDLER'] = '1' 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--name', type=str, default='integration_test', help="job name") 15 | parser.add_argument('--instance_type', type=str, default="c5.large") 16 | parser.add_argument('--num_tasks', type=int, default=2) 17 | parser.add_argument('--image_name', type=str, default='Deep Learning AMI (Ubuntu) Version 23.0') 18 | parser.add_argument('--spot', action='store_true', 19 | help='use spot instead of regular instances') 20 | 21 | parser.add_argument('--nproc_per_node', type=int, default=1) 22 | parser.add_argument('--conda_env', type=str, default='pytorch_p36') 23 | 24 | parser.add_argument('--skip_setup', action='store_true') 25 | parser.add_argument('--local_rank', default=0, type=int) 26 | 27 | 28 | parser.add_argument('--role', type=str, default='launcher', 29 | help='internal flag, launcher or worker') 30 | args = parser.parse_args() 31 | 32 | 33 | def random_id(k=5): 34 | """Random id to use for AWS identifiers.""" 35 | # https://stackoverflow.com/questions/2257441/random-string-generation-with-upper-case-letters-and-digits-in-python 36 | return ''.join(random.choices(string.ascii_lowercase + string.digits, k=k)) 37 | 38 | 39 | def launcher(): 40 | # run this test out of root directory of ncluster to capture .git and requirements.txt 41 | script_fn = 'tests/integration_test.py' 42 | 43 | import ncluster 44 | job = ncluster.make_job(**vars(args)) 45 | job.rsync('.') 46 | job.run('pip install -r requirements.txt') 47 | task0 = job.tasks[0] 48 | 49 | task0.run(f'python {script_fn} --role=worker --name={args.name}-{random_id()} --local_rank=0', stream_output=True) 50 | 51 | 52 | def main(): 53 | if args.role == "launcher": 54 | launcher() 55 | elif args.role == "worker": 56 | # rank = int(os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK', 0)) # ompi way 57 | # rank = int(os.environ.get('RANK', '0')) # pytorch way 58 | rank = args.local_rank # cmd args way 59 | 60 | if rank != 0: 61 | os.environ['WANDB_MODE'] = 'dryrun' # all wandb.log are no-op 62 | wandb.init(project='ncluster', name=args.name, entity='circleci') 63 | print(f"{os.uname()[1]} {rank} {' '.join(sys.argv)}") 64 | sys.stdout.flush() 65 | 66 | 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /tests/join_test.py: -------------------------------------------------------------------------------- 1 | import ncluster 2 | import pytest 3 | 4 | def test(): 5 | task = ncluster.make_task(image_name=ncluster.aws_backend.GENERIC_SMALL_IMAGE) 6 | task.run("mkdir /illegal", non_blocking=True) 7 | task.join(ignore_errors=True) # this succeed/print error message 8 | 9 | task.run("mkdir /illegal", non_blocking=True) 10 | with pytest.raises(RuntimeError): 11 | task.join() # this should fail 12 | 13 | if __name__ == '__main__': 14 | test() 15 | 
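Note: join_test.py above exercises the non-blocking run pattern. The following is a minimal usage sketch, not part of the test suite, assuming a configured AWS backend that implements the same Task interface as old_backend.py (run, join, wait_for_file); the marker file path is illustrative only.

# Hedged sketch: non-blocking run + join, modeled on tests/join_test.py.
# Assumes AWS credentials / ncluster setup are already in place.
import ncluster

def example_nonblocking_join():
    task = ncluster.make_task(image_name=ncluster.aws_backend.GENERIC_SMALL_IMAGE)

    # Start a long-running command without blocking the launcher.
    # '/tmp/done_marker' is a hypothetical sentinel file.
    task.run('sleep 5 && touch /tmp/done_marker', non_blocking=True)

    # ... launcher-side work could happen here ...

    task.join(ignore_errors=True)                     # wait for the last run() to finish
    task.wait_for_file('/tmp/done_marker', max_wait_sec=60)  # file-based completion check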
-------------------------------------------------------------------------------- /tests/logdir_test.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # tests to make sure that logdir logic works 3 | import inspect 4 | import random 5 | import sys 6 | import threading 7 | 8 | import ncluster 9 | 10 | 11 | def test_two_jobs(): 12 | run = ncluster.make_run('logdir_test') 13 | job1 = run.make_job('job1') 14 | task1 = job1.tasks[0] 15 | task1.run(f'echo hello > {task1.logdir}/message') 16 | job2 = run.make_job('job2') 17 | task2 = job2.tasks[0] 18 | assert task2.read(f'{task2.logdir}/message').strip() == 'hello' 19 | 20 | 21 | def test_multiple_logdirs(): 22 | logdir1 = ncluster.get_logdir_root() + '/test1' 23 | dummy_task = ncluster.make_task() 24 | dummy_task.run(f'rm -Rf {logdir1}') 25 | task1 = ncluster.make_task(run_name='test1') 26 | assert task1.logdir == logdir1 27 | 28 | logdir2 = ncluster.get_logdir_root() + '/test2' 29 | task2 = ncluster.make_task(run_name='test2') 30 | dummy_task.run(f'rm -Rf {logdir2}*') 31 | dummy_task.run(f'mkdir {logdir2}') 32 | assert task2.logdir == logdir2 + '.01' 33 | 34 | 35 | def test_multiple_logdir_tasks(): 36 | n = 10 37 | dummy_task = ncluster.make_task() 38 | logdir1 = ncluster.get_logdir_root() + '/test1' 39 | dummy_task.run(f'rm -Rf {logdir1}') 40 | job = ncluster.make_job(run_name='test1', num_tasks=n) 41 | 42 | obtained_logdirs = [] 43 | 44 | import wrapt 45 | 46 | @wrapt.synchronized 47 | def query(i): 48 | obtained_logdirs.append(job.tasks[i].logdir) 49 | 50 | threads = [threading.Thread(target=query, args=(i,)) for i in range(n)] 51 | for thread in reversed(threads): 52 | thread.start() 53 | 54 | random.shuffle(threads) 55 | for thread in threads: 56 | thread.join() 57 | 58 | assert len(set(obtained_logdirs)) == 1 59 | assert obtained_logdirs[0] == logdir1 60 | 61 | 62 | def run_all_tests(module): 63 | all_functions = inspect.getmembers(module, inspect.isfunction) 64 | for name, func in all_functions: 65 | if name.startswith('test'): 66 | print("Testing " + name) 67 | func() 68 | print(module.__name__ + " tests passed.") 69 | 70 | 71 | def manual(): 72 | run_all_tests(sys.modules[__name__]) 73 | 74 | 75 | if __name__ == '__main__': 76 | manual() 77 | -------------------------------------------------------------------------------- /tests/many_commands_test.py: -------------------------------------------------------------------------------- 1 | import ncluster 2 | import ncluster.util as util 3 | 4 | # Test for a fix to exception with too many concurrent connections ("paramiko.ssh_exception.ChannelException: (1, 'Administratively prohibited')) 5 | 6 | 7 | 8 | def test(): 9 | task = ncluster.make_task('test2') 10 | for i in range(20): 11 | task.run('ls', stream_output=True) 12 | 13 | 14 | if __name__ == '__main__': 15 | test() 16 | -------------------------------------------------------------------------------- /tests/run_test.py: -------------------------------------------------------------------------------- 1 | import ncluster 2 | 3 | def test(): 4 | run = ncluster.make_run('run_test') 5 | job1 = run.make_job('job1') 6 | task1 = job1.tasks[0] 7 | assert task1.name == '0.job1.run_test' 8 | task1.run(f'echo task1sayshello > {task1.logdir}/message') 9 | job2 = run.make_job('job2') 10 | task2 = job2.tasks[0] 11 | assert task2.name == '0.job2.run_test' 12 | assert task2.read(f'{task2.logdir}/message').strip() == 'task1sayshello' 13 | 14 | 15 | if __name__ == '__main__': 16 | test() 17 | 
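Note: run_test.py and logdir_test.py above rely on the Run/Job/Task conventions documented in ncluster_globals.py and old_backend.py: task names reverse to `<index>.<job>.<run>`, and all tasks in a run share one logdir. The sketch below is illustrative only (the training command and job sizes are placeholders), assuming the default AWS backend.

# Hedged sketch of a two-job launcher built on the Run/Job/Task hierarchy.
import ncluster

def example_run_hierarchy():
    run = ncluster.make_run('demo_run')     # container for related jobs
    workers = run.make_job('worker')        # task named 0.worker.demo_run
    tb = run.make_job('tb')                 # task named 0.tb.demo_run

    # All tasks in a run share the same logdir (see logdir_test.py above).
    logdir = workers.tasks[0].logdir
    workers.run(f'python train.py --logdir={logdir}')                 # 'train.py' is hypothetical
    tb.tasks[0].run(f'tensorboard --logdir={logdir}', non_blocking=True)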
-------------------------------------------------------------------------------- /tools/ncluster: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # tool to automate various AWS commands 3 | import datetime as dt 4 | import os 5 | import shlex 6 | import subprocess 7 | import sys 8 | import time 9 | from typing import Dict 10 | 11 | import pytz 12 | 13 | import ncluster 14 | 15 | from ncluster import aws_util as u 16 | from ncluster import util 17 | from ncluster.aws_backend import INSTANCE_INFO 18 | from boto3_type_annotations.ec2 import Volume 19 | from boto3_type_annotations.ec2 import Image 20 | 21 | VERBOSE = False 22 | 23 | 24 | def _run_shell(user_cmd): 25 | """Runs shell command, returns list of outputted lines 26 | with newlines stripped""" 27 | # print(cmd) 28 | p = subprocess.Popen(user_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 29 | (stdout, _) = p.communicate() 30 | stdout = stdout.decode('ascii') if stdout else '' 31 | lines = stdout.split('\n') 32 | stripped_lines = [] 33 | for l in lines: 34 | stripped_line = l.strip() 35 | if l: 36 | stripped_lines.append(stripped_line) 37 | return stripped_lines 38 | 39 | 40 | def _check_instance_found(instances, fragment, states=()): 41 | if not instances: 42 | if states: 43 | print(f"Couldn't find instances in state {states} matching '{fragment}' for key {u.get_keypair_name()}") 44 | else: 45 | print(f"Couldn't find instances matching '{fragment}' for key {u.get_keypair_name()}") 46 | return False 47 | return True 48 | 49 | 50 | def vprint(*args): 51 | if VERBOSE: 52 | print(*args) 53 | 54 | 55 | def toseconds(dt_): 56 | """Converts datetime object to seconds.""" 57 | return time.mktime(dt_.utctimetuple()) 58 | 59 | 60 | def ls(fragment=''): 61 | """List running instances""" 62 | print(f"https://console.aws.amazon.com/ec2/v2/home?region={u.get_region()}") 63 | 64 | stopped_instances = u.lookup_instances(fragment, states=['stopped']) 65 | stopped_names = list(u.get_name(i) for i in stopped_instances) 66 | if stopped_names: 67 | print("ignored stopped instances: ", ", ".join(stopped_names)) 68 | 69 | instances = u.lookup_instances(fragment) 70 | print('-' * 80) 71 | print( 72 | f"{'name':18s} {'hours_live':>10s} {'cost_in_$':>10s} {'instance_type':>15s} {'public_ip':>15s} " 73 | f"{'key/owner':>15s} {'private ip':>15s}") 74 | print('-' * 80) 75 | for instance in instances[::-1]: 76 | # current time in UTC zone (default AWS) 77 | now_time = dt.datetime.utcnow().replace(tzinfo=pytz.utc) 78 | launch_time = instance.launch_time 79 | elapsed_sec = toseconds(now_time) - toseconds(launch_time) 80 | elapsed_hours = elapsed_sec / 3600 81 | instance_type = instance.instance_type 82 | if instance_type in INSTANCE_INFO: 83 | cost = INSTANCE_INFO[instance_type]['cost'] * elapsed_hours 84 | else: 85 | cost = -1 86 | key_name = str(instance.key_name) # could be None 87 | print(f"{u.get_name(instance):18s} {elapsed_sec / 3600:10.1f} {cost:10.0f} {instance_type[:5]:>15s} " 88 | f"{instance.public_ip_address:>15s} {key_name[9:]:>15s} {instance.private_ip_address:>15s} " 89 | f"{instance.placement_group.name} ") 90 | 91 | # list spot requests, ignore active ones since they show up already 92 | client = u.get_ec2_client() 93 | spot_requests = [] 94 | for request in client.describe_spot_instance_requests()['SpotInstanceRequests']: 95 | state = request['State'] 96 | # TODO(y) also ignore state == 'fulfilled'? 
97 | if state == 'cancelled' or state == 'closed' or state == 'active': 98 | continue 99 | 100 | launch_spec = request['LaunchSpecification'] 101 | spot_requests.append(launch_spec['InstanceType']) 102 | if spot_requests: 103 | print(f"Pending spot instances: {','.join(spot_requests)}") 104 | # client.cancel_spot_instance_requests(SpotInstanceRequestIds=[request['SpotInstanceRequestId']]) 105 | 106 | 107 | def etchosts(_): 108 | """Copy/pastable /etc/hosts file""" 109 | instances = u.lookup_instances() 110 | instance_tuples = [(u.get_name(i), i.public_ip_address) for i in instances] 111 | print('-' * 80) 112 | print("paste following into your /etc/hosts") 113 | print('-' * 80) 114 | for name, ip in sorted(instance_tuples): 115 | print(f"{ip} {name}") 116 | 117 | print("""\n127.0.0.1 localhost 118 | 255.255.255.255 broadcasthost 119 | ::1 localhost""") 120 | 121 | 122 | def _user_keypair_check(instance): 123 | launching_user = instance.key_name[len(u.get_prefix()) + 1:] 124 | current_user = os.environ['USER'] 125 | assert launching_user == current_user, f"Set USER={launching_user} to connect to this machine, and make sure their " \ 126 | f".pem file is in your ~/.ncluster" 127 | 128 | 129 | def ssh(fragment=''): 130 | """SSH into the instace with the given prefix.""" 131 | instances = u.lookup_instances(fragment) 132 | if not _check_instance_found(instances, fragment): 133 | return 134 | instance = instances[0] 135 | if len(instances) > 1: 136 | print(f"Found {len(instances)} instances matching {fragment}, connecting to most recent {u.get_name(instance)} " 137 | f"launched by {instance.key_name}") 138 | else: 139 | print(f"Connecting to {u.get_name(instance)} " 140 | f"launched by {instance.key_name}") 141 | 142 | _user_keypair_check(instance) 143 | user_cmd = f"ssh -t -i {u.get_keypair_fn()} -o StrictHostKeyChecking=no -o ServerAliveCountMax=1 " \ 144 | f"-o ServerAliveInterval=60 " \ 145 | f"{u.get_aws_username(instance)}@{instance.public_ip_address} " 146 | print(user_cmd) 147 | os.system(user_cmd) 148 | 149 | 150 | def reboot(fragment=''): 151 | """reboots given instance.""" 152 | instances = u.lookup_instances(fragment) 153 | if not _check_instance_found(instances, fragment): 154 | return 155 | instance = instances[0] 156 | if len(instances) > 1: 157 | print(f"Found {len(instances)} instances matching {fragment}, connecting to most recent {u.get_name(instance)} " 158 | f"launched by {instance.key_name}") 159 | else: 160 | print(f"Rebooting to {u.get_name(instance)} ({instance.id})" 161 | f"launched by {instance.key_name}") 162 | 163 | _user_keypair_check(instance) 164 | instance.reboot() 165 | 166 | 167 | def old_ssh(fragment=''): 168 | """SSH into the instace with the given prefix. 
Works on dumb terminals.""" 169 | instances = u.lookup_instances(fragment) 170 | if not _check_instance_found(instances, fragment): 171 | return 172 | instance = instances[0] 173 | if len(instances) > 1: 174 | print(f"Found {len(instances)} instances matching {fragment}, connecting to most recent {u.get_name(instance)} " 175 | f"launched by {instance.key_name}") 176 | else: 177 | print(f"Connecting to {u.get_name(instance)} " 178 | f"launched by {instance.key_name}") 179 | 180 | _user_keypair_check(instance) 181 | user_cmd = f"ssh -i {u.get_keypair_fn()} -o StrictHostKeyChecking=no -o ConnectTimeout=10 " \ 182 | f"-o ServerAliveCountMax=1 " \ 183 | f"-o ServerAliveInterval=60 " \ 184 | f"{u.get_aws_username(instance)}@{instance.public_ip_address}" 185 | print(user_cmd) 186 | os.system(user_cmd) 187 | 188 | 189 | def connect(fragment=''): 190 | """SSH into the instance using authorized keys mechanism.""" 191 | instances = u.lookup_instances(fragment) 192 | if not _check_instance_found(instances, fragment): 193 | return 194 | instance = instances[0] 195 | if len(instances) > 1: 196 | print(f"Found {len(instances)} instances matching {fragment}, connecting to most recent {u.get_name(instance)} " 197 | f"launched by {instance.key_name}") 198 | else: 199 | print(f"Connecting to {u.get_name(instance)} " 200 | f"launched by {instance.key_name}") 201 | 202 | ssh_cmd = f"ssh -t -o StrictHostKeyChecking=no -o ConnectTimeout=10 " \ 203 | f"-o ServerAliveCountMax=1 " \ 204 | f"-o ServerAliveInterval=60 " \ 205 | f"{u.get_aws_username(instance)}@{instance.public_ip_address} " 206 | connect_cmd = ssh_cmd 207 | do_tmux = False 208 | if 'INSIDE_EMACS' in os.environ: 209 | print("detected Emacs, skipping tmux attach") 210 | elif os.environ.get('TERM', 'dumb') == 'dumb': 211 | print("Dumb terminal, doesn't support tmux, skipping tmux attach") 212 | elif 'NO_TMUX' in os.environ: 213 | print("detected NO_TMUX, skipping tmux attach") 214 | else: 215 | do_tmux = True 216 | connect_cmd += " tmux a" 217 | 218 | print(connect_cmd) 219 | exit_code = os.system(connect_cmd) 220 | 221 | if exit_code != 0 and do_tmux: 222 | fix_cmd = ssh_cmd + " tmux new" 223 | print(f"Creating ssh tmux a returned {exit_code}, recreate tmux using '{fix_cmd}'") 224 | return 225 | else: 226 | print(f"cmd {connect_cmd} returned {exit_code}") 227 | 228 | 229 | def connectm(fragment=''): 230 | """Like connect, but uses mosh""" 231 | instances = u.lookup_instances(fragment) 232 | if not _check_instance_found(instances, fragment): 233 | return 234 | instance = instances[0] 235 | if len(instances) > 1: 236 | print(f"Found {len(instances)} instances matching {fragment}, connecting to most recent {u.get_name(instance)} " 237 | f"launched by {instance.key_name}") 238 | else: 239 | print(f"Connecting to {u.get_name(instance)} " 240 | f"launched by {instance.key_name}") 241 | 242 | user_cmd = f"mosh --ssh='ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 " \ 243 | f"-o ServerAliveCountMax=1 " \ 244 | f"-o ServerAliveInterval=60' " \ 245 | f"{u.get_aws_username(instance)}@{instance.public_ip_address}" 246 | print(user_cmd) 247 | os.system(user_cmd) 248 | 249 | 250 | def kill(fragment: str = '', stop_instead_of_kill: bool = False): 251 | """ 252 | 253 | Args: 254 | fragment: 255 | stop_instead_of_kill: use stop_instances instead of terminate_instances 256 | """ 257 | 258 | if stop_instead_of_kill: 259 | states = ['running'] 260 | else: 261 | states = ['running', 'stopped'] 262 | instances = u.lookup_instances(fragment, states=states, 
limit_to_current_user=False) 263 | instances_to_kill = [] 264 | instances_to_skip = [] 265 | users_to_skip = set() 266 | instances_to_kill_formatted = [] 267 | for i in instances: 268 | state = i.state['Name'] 269 | 270 | if not LIMIT_TO_CURRENT_USER or i.key_name == u.get_keypair_name(): 271 | instances_to_kill_formatted.append((" ", u.get_name(i), i.instance_type, i.key_name, state if state == 'stopped' else '')) 272 | instances_to_kill.append(i) 273 | else: 274 | instances_to_skip.append(u.get_name(i)) 275 | users_to_skip.add(i.key_name[9:]) 276 | 277 | if stop_instead_of_kill: 278 | action = 'stopping' 279 | override_action = 'reallystop' 280 | else: 281 | action = 'terminating' 282 | override_action = 'reallykill' 283 | 284 | if instances_to_skip: 285 | print(f"Skipping {','.join(instances_to_skip)} launched by ({', '.join(users_to_skip)}), override with {override_action}") 286 | if not _check_instance_found(instances_to_kill, fragment, states): 287 | return 288 | 289 | print(f"{action}:") 290 | for line in instances_to_kill_formatted: 291 | print(*line) 292 | 293 | ec2_client = u.get_ec2_client() 294 | # don't ask for confirmation when stopping, erronous stopping has milder consequences 295 | num_instances = len(instances_to_kill) 296 | # if util.is_set("NCLUSTER_SKIP_CONFIRMATION") or stop_instead_of_kill: 297 | # print("NCLUSTER_SKIP_CONFIRMATION is set or stop_instead_of_kill, skipping confirmation") 298 | answer = input(f"{num_instances} instances found, {action} in {u.get_region()}? (y/N) ") 299 | 300 | if answer.lower() == "y": 301 | instance_ids = [i.id for i in instances_to_kill] 302 | 303 | if stop_instead_of_kill: 304 | response = ec2_client.stop_instances(InstanceIds=instance_ids) 305 | else: 306 | response = ec2_client.terminate_instances(InstanceIds=instance_ids) 307 | 308 | assert u.is_good_response(response), response 309 | print(f"{action} {num_instances} instances: success") 310 | else: 311 | print("Didn't get y, doing nothing") 312 | 313 | 314 | def stop(fragment=''): 315 | kill(fragment, stop_instead_of_kill=True) 316 | 317 | 318 | LIMIT_TO_CURRENT_USER = True 319 | 320 | 321 | def reallykill(*args, **kwargs): 322 | """Kill instances, including ones launched by other users.""" 323 | global LIMIT_TO_CURRENT_USER 324 | LIMIT_TO_CURRENT_USER = False 325 | kill(*args, **kwargs) 326 | LIMIT_TO_CURRENT_USER = True 327 | 328 | 329 | def reallystop(*args, **kwargs): 330 | """Stop instances, including ones launched by other users.""" 331 | global LIMIT_TO_CURRENT_USER 332 | LIMIT_TO_CURRENT_USER = False 333 | stop(*args, **kwargs) 334 | LIMIT_TO_CURRENT_USER = True 335 | 336 | 337 | def start(fragment=''): 338 | instances = u.lookup_instances(fragment, states=['stopped']) 339 | for i in instances: 340 | print(u.get_name(i), i.instance_type, i.key_name) 341 | 342 | if not instances: 343 | print("no stopped instances found, quitting") 344 | return 345 | 346 | # answer = input(f"{len(instances)} instances found, start in {u.get_region()}? 
(y/N) ") 347 | answer = 'y' 348 | 349 | if answer.lower() == "y": 350 | for i in instances: 351 | print(f"starting {u.get_name(i)}") 352 | i.start() 353 | else: 354 | print("Didn't get y, doing nothing") 355 | return 356 | 357 | print("Warning, need to manually mount efs on instance: ") 358 | print_efs_mount_command() 359 | 360 | 361 | def mosh(fragment=''): 362 | instances = u.lookup_instances(fragment) 363 | if not _check_instance_found(instances, fragment): 364 | return 365 | instance = instances[0] 366 | print(f"Found {len(instances)} instances matching {fragment}, connecting to most recent {u.get_name(instance)}") 367 | _user_keypair_check(instance) 368 | 369 | user_cmd = f"mosh --ssh='ssh -i {u.get_keypair_fn()} -o StrictHostKeyChecking=no' " \ 370 | f"{u.get_aws_username(instance)}@{instance.public_ip_address}" # tmux attach" 371 | print(user_cmd) 372 | os.system(user_cmd) 373 | 374 | 375 | def print_efs_mount_command(): 376 | print(u.get_efs_mount_command()) 377 | 378 | 379 | def efs(_): 380 | print("EFS information. To upload to remote EFS use 'ncluster efs_sync'") 381 | print_efs_mount_command() 382 | print() 383 | print() 384 | 385 | efs_client = u.get_efs_client() 386 | response = efs_client.describe_file_systems() 387 | assert u.is_good_response(response), response 388 | 389 | for efs_response in response['FileSystems']: 390 | # {'CreationTime': datetime.datetime(2017, 12, 19, 10, 3, 44, tzinfo=tzlocal()), 391 | # 'CreationToken': '1513706624330134', 392 | # 'Encrypted': False, 393 | # 'FileSystemId': 'fs-0f95ab46', 394 | # 'LifeCycleState': 'available', 395 | # 'Name': 'nexus01', 396 | # 'NumberOfMountTargets': 0, 397 | # 'OwnerId': '316880547378', 398 | # 'PerformanceMode': 'generalPurpose', 399 | # 'SizeInBytes': {'Value': 6144}}, 400 | efs_id = efs_response['FileSystemId'] 401 | tags_response = efs_client.describe_tags(FileSystemId=efs_id) 402 | assert u.is_good_response(tags_response) 403 | key = u.get_name(tags_response.get('Tags', '')) 404 | print("%-16s %-16s" % (efs_id, key)) 405 | print('-' * 40) 406 | 407 | # list mount points 408 | response = efs_client.describe_mount_targets(FileSystemId=efs_id) 409 | ec2 = u.get_ec2_resource() 410 | if not response['MountTargets']: 411 | print("") 412 | else: 413 | for mount_response in response['MountTargets']: 414 | subnet = ec2.Subnet(mount_response['SubnetId']) 415 | zone = subnet.availability_zone 416 | state = mount_response['LifeCycleState'] 417 | id_ = mount_response['MountTargetId'] 418 | ip = mount_response['IpAddress'] 419 | print('%-16s %-16s %-16s %-16s' % (zone, ip, id_, state,)) 420 | 421 | 422 | def terminate_tmux(_): 423 | """Script to clean-up tmux sessions.""" 424 | 425 | for line in _run_shell('tmux ls'): 426 | session_name = line.split(':', 1)[0] 427 | 428 | if session_name == 'tensorboard' or session_name == 'jupyter' or session_name == 'dropbox': 429 | print("Skipping " + session_name) 430 | continue 431 | print("Killing " + session_name) 432 | _run_shell('tmux kill-session -t ' + session_name) 433 | 434 | 435 | def nano(*_unused_args): 436 | """Bring up t2.nano instance.""" 437 | ncluster.make_task(name='shell', 438 | instance_type='t2.nano') 439 | 440 | 441 | def cmd(user_cmd): 442 | """Finds most recent instance launched by user, runs commands there, pipes output to stdout""" 443 | 444 | instances = u.lookup_instances(limit_to_current_user=True) 445 | assert instances, f"{u.get_username()} doesn't have an instances to connect to. Use 'ncluster nano'" \ 446 | f" to bring up a small instance." 
447 | instance = instances[0] 448 | user_cmd = f"ssh -t -i {u.get_keypair_fn()} -o StrictHostKeyChecking=no " \ 449 | f"{u.get_aws_username(instance)}@{instance.public_ip_address} {user_cmd}" 450 | os.system(user_cmd) 451 | 452 | 453 | def cat(user_cmd): cmd('cat ' + user_cmd) 454 | 455 | 456 | def ls_(user_cmd): cmd('ls ' + user_cmd) 457 | 458 | 459 | def cleanup_placement_groups(*_args): 460 | print("Deleting all placement groups") 461 | # TODO(y): don't delete groups that have currently stopped instances 462 | client = u.get_ec2_client() 463 | for group in client.describe_placement_groups().get('PlacementGroups', []): 464 | name = group['GroupName'] 465 | sys.stdout.write(f"Deleting {name} ... ") 466 | sys.stdout.flush() 467 | try: 468 | client.delete_placement_group(GroupName=name) 469 | print("success") 470 | except Exception as _: 471 | print("failed") 472 | 473 | 474 | # ncluster launch --image_name=dlami23-efa --instance_type=c5.large --name=test 475 | def launch(args_str): 476 | import argparse 477 | parser = argparse.ArgumentParser() 478 | parser.add_argument('--name', type=str, default='ncluster_launch', help="instance name") 479 | # parser.add_argument('--image_name', type=str, default='') # default small image 480 | parser.add_argument('--image_name', type=str, default='Deep Learning AMI (Ubuntu) Version 23.0') 481 | # can also use --image_name='Deep Learning AMI (Amazon Linux) Version 23.0' # cybertronai01 482 | parser.add_argument('--instance_type', type=str, default='c5.large', help="type of instance") 483 | parser.add_argument('--disk_size', type=int, default=0, help="size of disk in GBs. If 0, use default size for the image") 484 | args = parser.parse_args(shlex.split(args_str)) 485 | 486 | return ncluster.make_task(**vars(args)) 487 | 488 | 489 | def fix_default_security_group(_): 490 | """Allows ncluster and ncluster_nd security groups to exchange traffic with each other.""" 491 | 492 | def peer(current, other): 493 | """allow current group to accept all traffic from other group""" 494 | 495 | groups = u.get_security_group_dict() 496 | current_group = groups[current] 497 | other_group = groups[other] 498 | response = {} 499 | for protocol in ['icmp']: 500 | try: 501 | rule = {'FromPort': -1, 502 | 'IpProtocol': protocol, 503 | 'IpRanges': [], 504 | 'PrefixListIds': [], 505 | 'ToPort': -1, 506 | 'UserIdGroupPairs': [{'GroupId': other_group.id}]} 507 | response = current_group.authorize_ingress(IpPermissions=[rule]) 508 | 509 | except Exception as e: 510 | if 'InvalidPermission.Duplicate' in str(e):  # rule already exists, check the exception rather than the last response 511 | print("Warning, got " + str(e)) 512 | else: 513 | assert False, "Failed while authorizing ingress with " + str(e) 514 | 515 | for protocol in ['tcp', 'udp']: 516 | try: 517 | rule = {'FromPort': 0, 518 | 'IpProtocol': protocol, 519 | 'IpRanges': [], 520 | 'PrefixListIds': [], 521 | 'ToPort': 65535, 522 | 'UserIdGroupPairs': [{'GroupId': other_group.id}]} 523 | response = current_group.authorize_ingress(IpPermissions=[rule]) 524 | except Exception as e: 525 | if 'InvalidPermission.Duplicate' in str(e): 526 | print("Warning, got " + str(e)) 527 | else: 528 | assert False, "Failed while authorizing ingress with " + str(e) 529 | 530 | group1 = u.get_security_group_name() 531 | group2 = u.get_security_group_nd_name() 532 | 533 | peer(group1, group2) 534 | peer(group2, group1) 535 | 536 | 537 | def keys(_): 538 | """Runs ssh-keygen if necessary, prints public key.""" 539 | key = util.get_public_key() 540 | print("Your public key is below. 
Append all of your team members' public keys to NCLUSTER_AUTHORIZED_KEYS env var separated by ; i.e. \nNCLUSTER_AUTHORIZED_KEYS=;;\n") 541 | print(key) 542 | 543 | 544 | def lookup_image(image_id: str) -> Image: 545 | """Looks up image from image id like 'ami-0cc96feef8c6bbff3', prints image.name""" 546 | # could use ec2.images.filter(ImageIds=['ami-0cc96feef8c6bbff3']) 547 | assert image_id.startswith('ami-') 548 | ec2 = u.get_ec2_resource() 549 | images = list(ec2.images.filter(ImageIds=[image_id])) 550 | assert images, f"No images found with id={image_id}" 551 | assert len(images) == 1, f"Multiple images found with id={image_id}: {','.join(i.name for i in images)}" 552 | image = [im for im in images if im.id == image_id][0] 553 | print(image.name) 554 | return image 555 | 556 | 557 | def grow_disks(fragment: str, target_size_gb=500): 558 | """Grows disks for given machine to target_size_gb (default 500 GB)""" 559 | 560 | instance = u.lookup_instance(fragment) 561 | client = u.get_ec2_client() 562 | 563 | volumes = list(instance.volumes.all()) 564 | for vol in volumes: 565 | volume: Volume = vol 566 | if volume.size < target_size_gb: 567 | print("Growing %s to %s" % (volume.id, target_size_gb)) 568 | response = client.modify_volume(VolumeId=volume.id, Size=target_size_gb) 569 | assert u.is_good_response(response) 570 | else: 571 | print(f"Volume {volume.id} is already {volume.size} GB, skipping") 572 | 573 | 574 | def disks(fragment): 575 | """Print disk information for instance.""" 576 | instances = u.lookup_instances(fragment, states=('running', 'stopped')) 577 | print(f"{'device':>10s} {'size':>10s} {'type':>10s}({'iops'}) {'id':>30s} ") 578 | print("-" * 50) 579 | for instance in instances: 580 | print() 581 | print(f"Disks on instance '{u.get_name(instance)}' ({instance.id}, {instance.placement['AvailabilityZone']})") 582 | for volume in instance.volumes.all(): 583 | device = volume.attachments[0]['Device'] 584 | print(f"{device:>10s} {volume.size:>10d} {volume.volume_type:>10s}({volume.iops}) {volume.id:>30s} {u.get_name(volume)}") 585 | print("Unattached disks") 586 | 587 | ec2 = u.get_ec2_resource() 588 | for volume in ec2.volumes.all(): 589 | if not volume.attachments: 590 | print(f"{u.get_name(volume)} {volume.size:>10d} {volume.volume_type:>10s} {volume.id:>30s}") 591 | 592 | 593 | def fixkeys(_): 594 | key_name = u.get_keypair_name() 595 | pairs = u.get_keypair_dict() 596 | if key_name not in pairs: 597 | print(f"Default keypair {key_name} does not exist, returning") 598 | return 599 | keypair = pairs[key_name] 600 | 601 | print(f"Deleting current user keypair {key_name}") 602 | ec2 = u.get_ec2_resource() 603 | instance_list = [] 604 | for instance in ec2.instances.all(): 605 | if instance.state['Name'] == 'terminated': 606 | continue 607 | instance_list.append(instance) 608 | if instance_list: 609 | print("Warning, after deleting keypair, the following instances will no longer be accessible:") 610 | for i in instance_list: 611 | print(u.get_name(i), i.id) 612 | answer = input("Proceed? 
(y/N) ") 613 | else: 614 | answer = "y" 615 | if answer.lower() == 'y': 616 | keypair_fn = u.get_keypair_fn() 617 | if os.path.exists(keypair_fn): 618 | print(f"Deleting local .pem file '{keypair_fn}'") 619 | os.system(f'sudo rm -f {keypair_fn}') 620 | print(f"Deleting AWS keypair '{keypair.name}'") 621 | keypair.delete() 622 | 623 | 624 | def efs_sync(_): 625 | """Starts a daemon to sync local /ncluster/sync with remote /ncluster/sync.""" 626 | 627 | print("Syncing local /ncluster/sync to remote /ncluster/sync") 628 | if not os.path.exists('/ncluster/sync'): 629 | print("Local /ncluster/sync doesn't exist, creating") 630 | if not os.path.exists('/ncluster'): 631 | os.system('sudo mkdir /ncluster') 632 | os.system('sudo chown `whoami` /ncluster') 633 | os.system('mkdir /ncluster/sync') 634 | 635 | instances = u.lookup_instances(limit_to_current_user=True) 636 | if instances: 637 | instance = instances[0] 638 | print(f"Found {len(instances)} instances owned by {u.get_username()}, using {u.get_name(instance)} for syncing") 639 | else: 640 | print(f"Found no instances owned by {u.get_username()}, launching t2.nano instance to do the sync.") 641 | task = ncluster.make_task(name='shell', instance_type='t2.nano') 642 | instance = task.instance 643 | 644 | os.system(f'cd /ncluster/sync && nsync -m {u.get_name(instance)} -d /ncluster/sync') 645 | 646 | 647 | def spot_prices(instance_type: str) -> Dict[str, float]: 648 | """ 649 | 650 | Print spot instance pricing. 651 | Args: 652 | instance_type: AWS instance name. Common names can be shortcuts (p2, p3, c5) 653 | 654 | Returns: 655 | dictionary of zone->price for given instance 656 | """ 657 | if instance_type == 'p3': 658 | instance_type = 'p3.16xlarge' 659 | elif instance_type == 'p3dn': 660 | instance_type = 'p3dn.24xlarge' 661 | elif instance_type == 'p2': 662 | instance_type = 'p2.16xlarge' 663 | elif instance_type == 'c5': 664 | instance_type = 'c5.18xlarge' 665 | elif instance_type == 'c5n': 666 | instance_type = 'c5n.18xlarge' 667 | else: 668 | # expecting exact match 669 | pass 670 | product = 'Linux/UNIX (Amazon VPC)' 671 | result = {} 672 | client = u.get_ec2_client() 673 | print(f"Prices for instance {instance_type} on Linux") 674 | for zone in [z['ZoneName'] for z in client.describe_availability_zones()['AvailabilityZones'] if z['State'] == 'available']: 675 | try: 676 | price = client.describe_spot_price_history(InstanceTypes=[instance_type], 677 | MaxResults=1, 678 | ProductDescriptions=[product], 679 | AvailabilityZone=zone)['SpotPriceHistory'][0]['SpotPrice'] 680 | price = float(price) 681 | print(f"{zone}: {price:.2f}") 682 | result[zone] = price 683 | except IndexError as _: 684 | pass 685 | return result 686 | 687 | 688 | COMMANDS = { 689 | 'ls': ls, 690 | 'ssh': ssh, 691 | 'ssh_': old_ssh, 692 | 'old_ssh': old_ssh, 693 | 'mosh': mosh, 694 | 'kill': kill, 695 | 'reallykill': reallykill, 696 | 'stop': stop, 697 | 'reallystop': reallystop, 698 | 'start': start, 699 | 'efs': efs, 700 | 'cat': cat, 701 | 'ls_': ls_, 702 | 'nano': nano, 703 | 'cmd': cmd, 704 | '/etc/hosts': etchosts, 705 | 'hosts': etchosts, 706 | 'terminate_tmux': terminate_tmux, 707 | 'cleanup_placement_groups': cleanup_placement_groups, 708 | 'lookup_image': lookup_image, 709 | 'lookup_image_name': lookup_image, 710 | 'reboot': reboot, 711 | 'launch': launch, 712 | 'fix_default_security_group': fix_default_security_group, 713 | 'keys': keys, 714 | 'connect': connect, 715 | 'connectm': connectm, 716 | 'grow_disks': grow_disks, 717 | 'disks': disks, 718 | 'fixkeys': fixkeys,
719 | 'efs_sync': efs_sync, 720 | 'spot_prices': spot_prices, 721 | } 722 | 723 | 724 | def main(): 725 | print(f"Region ({u.get_region()}) $USER ({u.get_username()}) account ({u.get_account_number()}:{u.get_account_name()})") 726 | if len(sys.argv) < 2: 727 | mode = 'ls' 728 | else: 729 | mode = sys.argv[1] 730 | if mode == 'help': 731 | for k, v in COMMANDS.items(): 732 | if v.__doc__: 733 | print(f'{k}\t{v.__doc__}') 734 | else: 735 | print(k) 736 | return 737 | if mode not in COMMANDS: 738 | assert False, f"unknown command '{mode}', available commands are {', '.join(str(a) for a in COMMANDS.keys())}" 739 | 740 | # for connect commands, don't escape args in order for exact match to work (ncluster connect `exactname`) 741 | if mode == 'connect' or mode == 'ssh' or mode == 'old_ssh' or mode == 'mosh': 742 | COMMANDS[mode](' '.join(sys.argv[2:])) # no escaping 743 | else: 744 | COMMANDS[mode](' '.join([shlex.quote(arg) for arg in sys.argv[2:]])) 745 | 746 | 747 | if __name__ == '__main__': 748 | main() 749 | -------------------------------------------------------------------------------- /tools/nsync: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Usage: 4 | # 5 | # To sync current directory with ~ on target instance 6 | # nsync -m ygpubox 7 | # 8 | # 9 | # 10 | # forked from original by gdb@openai 11 | import argparse 12 | import fcntl 13 | import logging 14 | import os 15 | import select 16 | import subprocess 17 | import sys 18 | 19 | from ncluster import aws_util as u 20 | 21 | # In modules, use `logger = logging.getLogger(__name__)` 22 | 23 | parser = argparse.ArgumentParser(description='sync') 24 | parser.add_argument('-v', '--verbose', action='count', dest='verbosity', 25 | default=0, help='Set verbosity.') 26 | parser.add_argument('-m', '--machine', type=str, default='', help="name of machine to sync with") 27 | parser.add_argument('-d', '--directory', type=str, default='.', help="which directory to sync to (default .)") 28 | args = parser.parse_args() 29 | 30 | logger = logging.getLogger() 31 | logger.addHandler(logging.StreamHandler(sys.stderr)) 32 | if args.verbosity == 0: 33 | logger.setLevel(logging.INFO) 34 | elif args.verbosity >= 1: 35 | logger.setLevel(logging.DEBUG) 36 | 37 | 38 | class Error(Exception): 39 | pass 40 | 41 | 42 | class Resyncd(object): 43 | def __init__(self, remote, sync): 44 | self.remote = remote 45 | self.sync = sync 46 | self.counter = 0 47 | 48 | def run(self): 49 | self.resync() 50 | sources = [sync.source for sync in self.sync] 51 | fswatch = subprocess.Popen(['fswatch'] + sources, stdout=subprocess.PIPE) 52 | fl = fcntl.fcntl(fswatch.stdout.fileno(), fcntl.F_GETFL) 53 | fcntl.fcntl(fswatch.stdout.fileno(), fcntl.F_SETFL, fl | os.O_NONBLOCK) 54 | while True: 55 | r, _, _ = select.select([fswatch.stdout], [], []) 56 | fswatch_output = r[0].read() 57 | output = fswatch_output.decode('ascii') 58 | files = output.strip().split("\n") 59 | 60 | # Ignore emacs swap files 61 | files = [f for f in files if '#' not in os.path.basename(f)] 62 | if files: 63 | print("changed: " + str(files)) 64 | files = set(files) # remove duplicates from fswatch_output 65 | if not files: 66 | continue 67 | 68 | print("---") 69 | print(files) 70 | print("---") 71 | self.resync() 72 | 73 | def resync(self): 74 | procs = [] 75 | for sync in self.sync: 76 | instances = u.lookup_instances(args.machine, verbose=False, limit_to_current_user=True) 77 | assert instances, f"Couldn't find instance {args.machine}"
78 | instance = instances[0] 79 | 80 | print("Syncing with ", u.get_name(instance)) 81 | 82 | command = sync.command(instance) 83 | popen = subprocess.Popen(command) 84 | procs.append({ 85 | 'popen': popen, 86 | 'command': command, 87 | }) 88 | # Wait 89 | for proc in procs: 90 | print(proc["command"]) 91 | proc['popen'].communicate() 92 | for proc in procs: 93 | if proc['popen'].returncode != 0: 94 | raise Error('Bad returncode from %s: %d' % (proc['command'], proc['popen'].returncode)) 95 | logger.info('Resync %d complete', self.counter) 96 | self.counter += 1 97 | 98 | 99 | class Sync(object): 100 | # todo: exclude .#sync.py 101 | excludes = ('*.model', '*.cache', '.picklecache', '*.pyc', '*.gz', '.*', '#*') 102 | 103 | def __init__(self, source, dest, modify_window=True, copy_links=False, excludes=()): 104 | self.source = os.path.expanduser(source) 105 | self.dest = dest 106 | self.modify_window = modify_window 107 | self.copy_links = copy_links 108 | self.excludes = self.excludes + excludes 109 | 110 | def command(self, instance): 111 | excludes = [] 112 | for exclude in self.excludes: 113 | excludes += ['--exclude', exclude] 114 | 115 | # todo, rename no_strict_checking to ssh_command 116 | 117 | keypair_fn = u.get_keypair_fn() 118 | username = u.get_aws_username(instance) 119 | ip = instance.public_ip_address 120 | 121 | ssh_command = "ssh -i %s -o StrictHostKeyChecking=no" % (keypair_fn,) 122 | no_strict_checking = ['-arvce', ssh_command] 123 | 124 | command = ['rsync'] + no_strict_checking + excludes 125 | if self.modify_window: 126 | command += ['--update', '--modify-window=600'] 127 | if self.copy_links: 128 | command += ['-L'] 129 | command += ['-rv', self.source, username + "@" + ip + ':' + self.dest] 130 | print("Running ") 131 | print(command) 132 | return command 133 | 134 | 135 | def main(): 136 | sync = [Sync(source='.', dest=args.directory, copy_links=False), ] 137 | 138 | # Resyncd's 'remote' argument is unused; the target machine comes from the --machine flag 139 | resyncd = Resyncd('asdf', sync) 140 | 141 | resyncd.run() 142 | return 0 143 | 144 | 145 | if __name__ == '__main__': 146 | sys.exit(main()) 147 | --------------------------------------------------------------------------------
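
Usage sketch (a hedged illustration, not part of the repository): assuming the tools/ncluster and tools/nsync scripts above are installed on PATH and AWS credentials are configured, the COMMANDS table and the nsync argparse flags suggest invocations along these lines:

    ncluster ls                                     # default command when no argument is given
    ncluster launch --image_name=dlami23-efa --instance_type=c5.large --name=test
    ncluster spot_prices p3                         # 'p3' shortcut expands to p3.16xlarge, prints per-zone spot prices
    ncluster kill test                              # asks for confirmation before terminating matching instances
    nsync -m test -d /ncluster/sync                 # rsync current directory to /ncluster/sync on instance 'test', re-syncing on fswatch events

nsync resolves the machine name with aws_util.lookup_instances(..., limit_to_current_user=True) and syncs with the first match, so the name only needs to match one of the current user's instances.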