├── .doctrees ├── .eggs │ ├── requests-2.19.1-py3.6.egg │ │ └── EGG-INFO │ │ │ └── DESCRIPTION.doctree │ └── urllib3-1.23-py3.6.egg │ │ └── EGG-INFO │ │ └── DESCRIPTION.doctree ├── environment.pickle └── index.doctree ├── .gitignore ├── LICENSE ├── README.md ├── benchmarks ├── README.md ├── mpi_two_machines.py ├── pytorch_two_machines.py ├── ray_ps.py ├── ray_two_machines.py ├── ray_two_machines_local.py ├── requirements.txt ├── summary.txt ├── tf_two_machines.py ├── tf_two_machines_local.py └── util.py ├── examples ├── deleteme.py ├── gpubox.py ├── gpubox_jupyter_notebook_config.py ├── gpubox_sample.ipynb ├── launch_16_instances.py ├── ray_example.py ├── requirements.txt ├── simple_job.py ├── simple_task.py ├── simple_tf.py ├── tf_adder.py └── tf_adder_tb.py ├── ncluster ├── __init__.py ├── aws_backend.py ├── aws_create_resources.py ├── aws_delete_resources.py ├── aws_util.py ├── backend.py ├── local_backend.py ├── ncluster.py ├── ncluster_globals.py ├── summary.txt ├── test.py └── util.py ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── join_test.py ├── logdir_test.py └── run_test.py /.doctrees/.eggs/requests-2.19.1-py3.6.egg/EGG-INFO/DESCRIPTION.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diux-dev/ncluster/2fd359621896717197b479c7174d06d80df1529b/.doctrees/.eggs/requests-2.19.1-py3.6.egg/EGG-INFO/DESCRIPTION.doctree -------------------------------------------------------------------------------- /.doctrees/.eggs/urllib3-1.23-py3.6.egg/EGG-INFO/DESCRIPTION.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diux-dev/ncluster/2fd359621896717197b479c7174d06d80df1529b/.doctrees/.eggs/urllib3-1.23-py3.6.egg/EGG-INFO/DESCRIPTION.doctree -------------------------------------------------------------------------------- /.doctrees/environment.pickle: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/diux-dev/ncluster/2fd359621896717197b479c7174d06d80df1529b/.doctrees/environment.pickle -------------------------------------------------------------------------------- /.doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diux-dev/ncluster/2fd359621896717197b479c7174d06d80df1529b/.doctrees/index.doctree -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /dist 2 | /build 3 | /.DS_Store 4 | /ncluster.egg-info 5 | /ncluster/__pycache__ 6 | /.eggs 7 | /ncluster/.idea 8 | /.idea 9 | __pycache__ 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2018] [Yaroslav Bulatov] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ncluster 2 | By Yaroslav Bulatov and Andrew Shaw 3 | 4 | ``` 5 | import ncluster 6 | task = ncluster.make_task(instance_type='p2.xlarge') 7 | task.upload('myscript.py') 8 | task.run('python myscript.py > out') 9 | task.download('out') 10 | ``` 11 | 12 | ## Installation 13 | Install pip, tmux, Python 3.6 (see below), then 14 | 15 | ``` 16 | pip install -r https://raw.githubusercontent.com/diux-dev/ncluster/master/requirements.txt 17 | pip install ncluster 18 | ``` 19 | 20 | ### Extra 21 | An example of installing pip/tmux/python 3.6 on MacOS 22 | 23 | 1. Download Anaconda distribution following https://conda.io/docs/user-guide/install/index.html 24 | 2. 
Install tmux through homebrew: https://brew.sh/, then `brew install tmux` 25 | 26 | Then 27 | 28 | ``` 29 | conda create -n new python=3.6 -y 30 | conda activate new 31 | ``` 32 | 33 | Extra Deps: 34 | ``` 35 | brew install fswatch 36 | ``` 37 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | Benchmarks 2 | 3 | ``` 4 | pip install -r https://raw.githubusercontent.com/diux-dev/ncluster/master/requirements.txt 5 | pip install ncluster 6 | python 7 | ``` 8 | 9 | 10 | # Debugging 11 | ``` 12 | export NCLUSTER_INSTANCE=c5.18xlarge 13 | export NCLUSTER_RUN_WITH_OUTPUT_ON_FAILURE=1 14 | ``` 15 | -------------------------------------------------------------------------------- /benchmarks/mpi_two_machines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Running locally 5 | 6 | 004/11 sent 100 MBs in 28.4 ms: 3519.33 MB/second 7 | 005/11 sent 100 MBs in 25.1 ms: 3988.50 MB/second 8 | 006/11 sent 100 MBs in 25.5 ms: 3918.33 MB/second 9 | 007/11 sent 100 MBs in 25.3 ms: 3958.61 MB/second 10 | 008/11 sent 100 MBs in 25.3 ms: 3954.15 MB/second 11 | 009/11 sent 100 MBs in 24.9 ms: 4009.78 MB/second 12 | 010/11 sent 100 MBs in 25.0 ms: 3992.75 MB/second 13 | min: 24.94, median: 25.52, mean: 29.53 14 | 15 | 16 | """ 17 | 18 | import argparse 19 | import json 20 | import os 21 | import numpy as np 22 | import tensorflow as tf 23 | import time 24 | 25 | import util 26 | 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 29 | parser.add_argument("--iters", default=11, type=int, 30 | help="Maximum number of additions") 31 | parser.add_argument("--size-mb", default=100, type=int, 32 | help="size of vector in MBs") 33 | parser.add_argument("--shards", default=1, type=int, 34 | help="how many ways to 
def run_worker():
    """Time point-to-point MPI transfers from rank 0 to the other rank.

    Rank 0 sends a float32 vector of ``args.size_mb`` megabytes ``args.iters``
    times with tag 13; any other rank posts the matching receive from rank 0.
    Each iteration's wall-clock time and the resulting MB/second are logged,
    followed by the min/median/mean of the per-iteration times.

    Rank 0 logs to ``/tmp/out`` (the launcher reads that file back); other
    ranks log to ``/dev/null``.
    """
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    if rank == 0:
        log = util.FileLogger('/tmp/out')
    else:
        # NOTE(review): mirror=False presumably disables echoing to stdout --
        # confirm against util.FileLogger.
        log = util.FileLogger('/dev/null', mirror=False)

    dim = args.size_mb * 250 * 1000  # 1 MB holds 250k float32 values
    dtype = np.float32
    if rank == 0:
        data = np.ones(dim, dtype=dtype)
    else:
        # Allocate the receive buffer once, outside the timed region, so the
        # measurement covers the transfer rather than a fresh allocation.
        data = np.empty(dim, dtype=dtype)

    time_list = []
    for i in range(args.iters):
        start_time = time.perf_counter()
        if rank == 0:
            comm.Send(data, dest=1, tag=13)
        else:
            comm.Recv(data, source=0, tag=13)
        end_time = time.perf_counter()

        elapsed_time_ms = (end_time - start_time) * 1000
        time_list.append(elapsed_time_ms)
        rate = args.size_mb / (elapsed_time_ms / 1000)
        log(f'{rank} {i:03d}/{args.iters:d} sent {args.size_mb:d} MBs in {elapsed_time_ms:.1f}'
            f' ms: {rate:.2f} MB/second')

    if not time_list:
        # --iters <= 0: nothing was measured and np.min([]) would raise.
        return

    fastest = np.min(time_list)
    median = np.median(time_list)
    log(f"min: {fastest:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")
def worker():
    """Run one side of the two-process PyTorch send/recv benchmark.

    Joins the process group described by ``--master-addr``, ``--master-port``,
    ``--rank`` and ``--size``, then performs ``args.iters`` timed transfers of
    a float tensor of ``args.size_mb`` megabytes: rank 0 sends to rank 1, any
    other rank receives from rank 0. Rank 0 logs to the file ``out`` (read
    back by the launcher); other ranks log to ``/dev/null``.
    """
    import numpy as np
    import torch
    import torch.distributed as dist

    print("Initializing distributed pytorch")
    os.environ['MASTER_ADDR'] = str(args.master_addr)
    os.environ['MASTER_PORT'] = str(args.master_port)
    # Use TCP backend. Gloo needs nightly, where it currently fails with
    #   AttributeError: module 'torch.distributed' has no attribute
    #   'init_process_group'
    dist.init_process_group('tcp', rank=args.rank,
                            world_size=args.size)

    # 1 MB holds 250k float32 values; fill with rank+1 so the two sides differ.
    tensor = torch.ones(args.size_mb * 250 * 1000) * (args.rank + 1)
    outfile = 'out' if args.rank == 0 else '/dev/null'
    log = util.FileLogger(outfile)

    time_list = []
    for i in range(args.iters):
        start_time = time.perf_counter()
        if args.rank == 0:
            dist.send(tensor=tensor, dst=1)
        else:
            dist.recv(tensor=tensor, src=0)
        elapsed_time_ms = (time.perf_counter() - start_time) * 1000

        time_list.append(elapsed_time_ms)
        rate = args.size_mb / (elapsed_time_ms / 1000)
        log('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb, elapsed_time_ms, rate))

    if not time_list:
        # --iters <= 0: nothing was measured and np.min([]) would raise.
        return

    fastest = np.min(time_list)
    median = np.median(time_list)
    log(f"min: {fastest:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")
def main():
    """Dispatch to ``launcher()`` or ``worker()`` according to ``--role``.

    Raises:
        AssertionError: if ``args.role`` is neither "launcher" nor "worker".
    """
    if args.role == "launcher":
        launcher()
    elif args.role == "worker":
        worker()
    else:
        # The original message referenced an undefined ``FLAGS`` object, so an
        # unknown role surfaced as NameError. Raising explicitly also keeps
        # the check alive under ``python -O``, which strips assert statements.
        raise AssertionError("Unknown role " + args.role)
@ray.remote(resources={"worker": 1})
class ParameterServer(object):
    """Ray actor that stores one parameter shard of ``dim`` float32 values.

    Requests one unit of the custom "worker" resource, the same resource the
    Worker actor in this file requests.
    """

    def __init__(self):
        zeros = np.zeros(dim, dtype=np.float32)
        self.params = zeros

    def receive(self, *grad_list):
        """Store the last gradient passed in and return the stored value."""
        # use = just to get network overhead: each gradient would overwrite
        # the previous one, so only the final argument is kept.
        if grad_list:
            self.params = grad_list[-1]
        return self.params

    def get_weights(self):
        """Return the currently stored parameters."""
        return self.params

    def ip(self):
        """Return the IP address of the node this actor runs on."""
        node_ip = ray.services.get_node_ip_address()
        return node_ip
def transpose(list_of_lists):
    """Swap rows and columns of a list of lists.

    Element ``j`` of input row ``i`` becomes element ``i`` of output row ``j``.
    Rows are zipped together, so the result is truncated to the length of the
    shortest row; an empty input gives an empty list.
    """
    return [list(column) for column in zip(*list_of_lists)]
def main():
    """Dispatch to ``run_launcher()`` or ``run_driver()`` according to ``--role``.

    Raises:
        AssertionError: if ``args.role`` is neither 'launcher' nor 'driver'.
    """
    if args.role == 'launcher':
        run_launcher()
    elif args.role == 'driver':
        run_driver()
    else:
        # Raise explicitly: ``assert False`` is stripped under ``python -O``,
        # which would let an unknown role fall through silently.
        raise AssertionError(
            f"Unknown role {args.role}, must be launcher/driver")
ms: 1506.90 MB/second 25 | # 009/11 sent 100 MBs in 76.5 ms: 1306.92 MB/second 26 | # 010/11 sent 100 MBs in 66.8 ms: 1497.64 MB/second 27 | # min: 66.36, median: 69.43, mean: 70.55 28 | 29 | # Another run 30 | # 989/1000 sent 100 MBs in 54.6 ms: 1831.07 MB/second 31 | # 990/1000 sent 100 MBs in 54.4 ms: 1837.20 MB/second 32 | # 991/1000 sent 100 MBs in 54.8 ms: 1824.91 MB/second 33 | # 992/1000 sent 100 MBs in 53.4 ms: 1874.39 MB/second 34 | # 993/1000 sent 100 MBs in 53.1 ms: 1881.77 MB/second 35 | # 994/1000 sent 100 MBs in 52.7 ms: 1897.76 MB/second 36 | # 995/1000 sent 100 MBs in 55.4 ms: 1805.42 MB/second 37 | # 996/1000 sent 100 MBs in 53.4 ms: 1872.93 MB/second 38 | # 997/1000 sent 100 MBs in 52.7 ms: 1896.65 MB/second 39 | # 998/1000 sent 100 MBs in 54.0 ms: 1851.14 MB/second 40 | # 999/1000 sent 100 MBs in 53.6 ms: 1864.93 MB/second 41 | # min: 51.11, median: 55.45, mean: 60.74 42 | 43 | 44 | # Bottom line: 30ms locally, 60ms over network 45 | 46 | import argparse 47 | import os 48 | import socket 49 | import subprocess 50 | import time 51 | 52 | import numpy as np 53 | import ray 54 | 55 | import util 56 | 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument("--role", default='launcher', type=str, 59 | help="launcher/driver") 60 | parser.add_argument('--image', 61 | default='Deep Learning AMI (Ubuntu) Version 15.0') 62 | parser.add_argument("--size-mb", default=100, type=int, 63 | help='how much data to send at each iteration') 64 | parser.add_argument("--iters", default=11, type=int) 65 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 66 | parser.add_argument("--xray", default=1, type=int, 67 | help="whether to use XRay backend") 68 | parser.add_argument('--nightly', default=1, type=int, 69 | help='whether to use nightly version') 70 | parser.add_argument('--name', default='ray_two_machines', type=str, 71 | help='name of the run') 72 | parser.add_argument("--ip", default='', type=str, 73 | help="internal flag, 
def run_launcher():
    """Create a two-task job and run the Ray benchmark driver on it.

    The first task starts the Ray head node advertising the custom "ps"
    resource; the second joins the cluster advertising "worker" and then runs
    this script with ``--role=driver``. The driver's ``out`` file is printed
    at the end.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    if args.nightly:
        # The macOS wheel is only used when the tasks run on a local Mac.
        if 'Darwin' in util.ossystem('uname') and not args.aws:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
        else:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    else:
        install_script = 'pip install ray'

    job = ncluster.make_job(name=args.name,
                            install_script=install_script,
                            image_name=args.image,
                            num_tasks=2)
    ps, worker = job.tasks
    if not ncluster.running_locally():
        # kill python just for when tmux session reuse is on
        ps._run_raw('killall python', ignore_errors=True)
        worker._run_raw('killall python', ignore_errors=True)

    job.upload(__file__)
    job.upload('util.py')
    if args.xray:
        job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    ps_resource = """--resources='{"ps": 1}'"""
    worker_resource = """--resources='{"worker": 1}'"""

    ps.run(f"ray start --head {ps_resource} --redis-port=6379")
    worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")

    # Invoke the script by its file name only: __file__ may carry a local
    # directory prefix that makes './<__file__>' an invalid path on the task.
    # NOTE(review): assumes upload() places the file in the task's working
    # directory under its base name -- confirm against ncluster's upload().
    script_name = os.path.basename(__file__)
    worker.run(
        f'./{script_name} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}')
    print(worker.read('out'))
@ray.remote(resources={"worker": 1})
class Worker(object):
    """Ray actor that hands out a fixed gradient vector of ``dim`` float32 ones.

    Requests one unit of the custom "worker" resource.
    """

    def __init__(self):
        ones = np.ones(dim, dtype=np.float32)
        self.gradients = ones

    def compute_gradients(self):
        """Return the gradient vector created in ``__init__``."""
        return self.gradients

    def ip(self):
        """Return the IP address of the node this actor runs on."""
        node_ip = ray.services.get_node_ip_address()
        return node_ip
def run_driver():
    """Time repeated worker -> parameter-server hand-offs and log the results.

    Connects to the Ray cluster at ``args.ip``, creates one ``Worker`` and one
    ``ParameterServer`` actor, and for ``args.iters`` iterations passes the
    worker's gradient array to the parameter server, waiting for the reply
    each time.  Per-iteration timings and the min/median/mean (in
    milliseconds) are written through a ``util.FileLogger`` on the file
    ``out``.
    """
    ray.init(redis_address=args.ip)

    grad_source = Worker.remote()
    param_server = ParameterServer.remote()
    log = util.FileLogger('out')
    log(f"Worker ip {ray.get(grad_source.ip.remote())}")
    log(f"Driver ip {socket.gethostbyname(socket.gethostname())}")

    timings_ms = []
    for step in range(args.iters):
        tic = time.perf_counter()
        grad_ref = grad_source.compute_gradients.remote()
        reply_ref = param_server.assign_add.remote(grad_ref)
        # Block on the reply so the timing covers the complete round trip;
        # the fetched value itself is not used.
        _ = ray.get(reply_ref)[0]
        took_ms = (time.perf_counter() - tic)*1000
        timings_ms.append(took_ms)
        mb_per_second = args.size_mb / (took_ms/1000)
        log('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (step, args.iters, args.size_mb, took_ms, mb_per_second))

    fastest_ms = np.min(timings_ms)
    median_ms = np.median(timings_ms)
    log(f"min: {fastest_ms:8.2f}, median: {median_ms:8.2f}, mean: {np.mean(timings_ms):8.2f}")
Create sender/receiver tasks and add arrays from sender tasks to variable on receiver. 6 | 7 | To run locally: 8 | ./tf_two_machines.py 9 | Should see something like this 10 | 11 | ``` 12 | 005/11 added 100 MBs in 78.9 ms: 1266.98 MB/second 13 | 006/11 added 100 MBs in 78.1 ms: 1280.07 MB/second 14 | 007/11 added 100 MBs in 78.1 ms: 1280.56 MB/second 15 | 008/11 added 100 MBs in 81.8 ms: 1222.76 MB/second 16 | 009/11 added 100 MBs in 79.5 ms: 1258.54 MB/second 17 | 010/11 added 100 MBs in 76.6 ms: 1305.64 MB/second 18 | min: 76.59, median: 78.80, mean: 88.34 19 | ``` 20 | 21 | To interact with task 1 (the driver), do "tmux a -t 1" 22 | 23 | To run on AWS 24 | aws configure # or set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY/AWS_DEFAULT_REGION 25 | ./tf_two_machines.py --aws 26 | 27 | Should see something like this with t3.large instances 28 | ``` 29 | 089/100 added 100 MBs in 253.8 ms: 504.27 MB/second 30 | 090/100 added 100 MBs in 252.6 ms: 506.63 MB/second 31 | 091/100 added 100 MBs in 255.0 ms: 501.92 MB/second 32 | ``` 33 | 34 | Running c5.18xlarge machines with more iterations 35 | 007/11 sent 100 MBs in 135.4 ms: 738.47 MB/second 36 | 008/11 sent 100 MBs in 133.0 ms: 752.04 MB/second 37 | 009/11 sent 100 MBs in 133.8 ms: 747.48 MB/second 38 | 010/11 sent 100 MBs in 136.3 ms: 733.77 MB/second 39 | min: 132.97, median: 134.98, mean: 137.27 40 | 41 | 42 | Can use more shards 43 | ./tf_two_machines.py --aws --shards=8 --iters=1000 44 | 994/1000 sent 100 MBs in 87.0 ms: 1149.50 MB/second 45 | 995/1000 sent 100 MBs in 87.0 ms: 1149.21 MB/second 46 | 996/1000 sent 100 MBs in 86.8 ms: 1152.11 MB/second 47 | 997/1000 sent 100 MBs in 89.8 ms: 1113.89 MB/second 48 | 998/1000 sent 100 MBs in 87.9 ms: 1137.37 MB/second 49 | 999/1000 sent 100 MBs in 88.0 ms: 1135.80 MB/second 50 | min: 86.12, median: 88.48, mean: 89.51 51 | 52 | 53 | To connect and interact with the job look for SSH instructions like this 54 | To connect to 0.tf_two_machines 55 | ssh -i 
def _launch_server(role):
    """Start the in-process TensorFlow server for task 0 of job ``role``.

    The cluster layout (module-level ``cluster_spec``) and this process's
    task identity are published through the ``TF_CONFIG`` environment
    variable.  A default ``tf.estimator.RunConfig`` is then created and its
    cluster spec, task type and task id are used to build the server
    (presumably RunConfig parses ``TF_CONFIG`` -- see the TensorFlow docs).

    Args:
        role: job name of this process, e.g. ``'chief'`` or ``'receiver'``.

    Returns:
        The ``tf.train.Server`` created for this task.
    """
    tf_config = {
        'cluster': cluster_spec,
        'task': {'type': role, 'index': 0},
    }
    os.environ['TF_CONFIG'] = json.dumps(tf_config)

    run_config = tf.estimator.RunConfig()
    server = tf.train.Server(run_config.cluster_spec,
                             job_name=run_config.task_type,
                             task_index=run_config.task_id)
    return server
ncluster.running_locally(): 116 | sender._run_raw('killall python', ignore_errors=True) 117 | receiver._run_raw('killall python', ignore_errors=True) 118 | 119 | if ncluster.get_backend() == 'aws': 120 | # on AWS probably running in conda DLAMI, switch into TF-enabled env 121 | job.run('source activate tensorflow_p36') 122 | 123 | ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}' 124 | receiver.run(f'python {__file__} --role=receiver {ip_config}', 125 | non_blocking=True) 126 | sender.run( 127 | f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}') 128 | print(sender.read('out')) 129 | 130 | 131 | def run_receiver(): 132 | server = _launch_server('receiver') 133 | time.sleep(365 * 24 * 3600) 134 | del server 135 | 136 | 137 | def run_sender(): 138 | param_size = 250 * 1000 * args.size_mb // args.shards # 1MB is 250k integers 139 | log = util.FileLogger('out') 140 | grads_array = [] 141 | with tf.device('/job:chief/task:0'): 142 | # grads = tf.fill([param_size], 1.) 
143 | for i in range(args.shards): 144 | grads = tf.Variable(tf.ones([param_size])) 145 | grads_array.append(grads) 146 | 147 | params_array = [] 148 | add_op_array = [] 149 | with tf.device('/job:receiver/task:0'): 150 | for i in range(args.shards): 151 | params = tf.Variable(tf.ones([param_size])) 152 | add_op = params.assign(grads_array[i]).op 153 | params_array.append(params) 154 | add_op_array.append(add_op) 155 | add_op = tf.group(*add_op_array) 156 | 157 | server = _launch_server('chief') 158 | sess = tf.Session(server.target) 159 | sess.run(tf.global_variables_initializer()) 160 | # except Exception as e: 161 | # # sometimes .run fails with .UnavailableError: OS Error 162 | # log(f"initialization failed with {e}, retrying in 1 second") 163 | # time.sleep(1) 164 | 165 | time_list = [] 166 | for i in range(args.iters): 167 | start_time = time.perf_counter() 168 | sess.run(add_op) 169 | elapsed_time_ms = (time.perf_counter() - start_time) * 1000 170 | time_list.append(elapsed_time_ms) 171 | rate = args.size_mb / (elapsed_time_ms / 1000) 172 | log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % ( 173 | i, args.iters, args.size_mb, elapsed_time_ms, rate)) 174 | 175 | min = np.min(time_list) 176 | median = np.median(time_list) 177 | 178 | log( 179 | f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 180 | 181 | 182 | def main(): 183 | # run local benchmark in launcher and launch service 184 | if args.role == "launcher": 185 | run_launcher() 186 | elif args.role == "sender": 187 | run_sender() 188 | elif args.role == "receiver": 189 | run_receiver() 190 | else: 191 | assert False, 'unknown role' 192 | 193 | 194 | if __name__ == '__main__': 195 | main() 196 | -------------------------------------------------------------------------------- /benchmarks/tf_two_machines_local.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Runs distributed benchmark on a single 
machine remotely 4 | 5 | Adding 100MB buffers 6 | 7 | # 1 shard: 88ms 8 | # 4 shards: 56ms 9 | # 8 shards: 51ms 10 | # 16 shards: 55ms 11 | 12 | # increase size 8x 13 | python tf_two_machines_local.py --shards=8 --iters=100 --size-mb=800 --aws 14 | # 416ms 15 | 16 | Bottom line: 1.6-1.9 GB/second when running locally 17 | Going 1->4 shards saves 30%, 4->8 shards another 5% 18 | 19 | i3.metal 30% slower than c5.18xlarge 20 | 21 | """ 22 | 23 | import argparse 24 | import json 25 | import os 26 | import numpy as np 27 | import tensorflow as tf 28 | import time 29 | 30 | import util 31 | 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 34 | parser.add_argument("--iters", default=11, type=int, 35 | help="Maximum number of additions") 36 | parser.add_argument("--size-mb", default=100, type=int, 37 | help="size of vector in MBs") 38 | parser.add_argument("--shards", default=1, type=int, 39 | help="how many ways to shard the variable") 40 | parser.add_argument('--image', 41 | default='Deep Learning AMI (Ubuntu) Version 15.0') 42 | parser.add_argument('--name', 43 | default='tf_two_machines_local') 44 | 45 | # internal flags 46 | parser.add_argument('--role', default='launcher', type=str) 47 | parser.add_argument("--sender-ip", default='127.0.0.1') 48 | parser.add_argument("--receiver-ip", default='127.0.0.1') 49 | args = parser.parse_args() 50 | 51 | cluster_spec = {'chief': [args.sender_ip + ':32300'], 52 | 'receiver': [args.receiver_ip + ':32301']} 53 | 54 | 55 | def _launch_server(role): 56 | os.environ['TF_CONFIG'] = json.dumps( 57 | {'cluster': cluster_spec, 58 | 'task': {'type': role, 'index': 0}}) 59 | config = tf.estimator.RunConfig() 60 | return tf.train.Server(config.cluster_spec, 61 | job_name=config.task_type, 62 | task_index=config.task_id) 63 | 64 | 65 | def run_launcher(): 66 | import ncluster 67 | ncluster.util.assert_script_in_current_directory() 68 | 69 | if args.aws: 70 | 
ncluster.set_backend('aws') 71 | 72 | # use 4GB instance, 0.5GB not enough 73 | worker = ncluster.make_task(args.name, image_name=args.image, 74 | instance_type='t3.medium') 75 | worker.upload(__file__) 76 | worker.upload('util.py') 77 | 78 | # kill python just for when tmux session reuse is on 79 | if not ncluster.running_locally(): 80 | # on AWS probably running in conda DLAMI, switch into TF-enabled env 81 | worker._run_raw('killall python', ignore_errors=True) 82 | worker.run('source activate tensorflow_p36') 83 | 84 | ip_config = f'--sender-ip={worker.ip} --receiver-ip={worker.ip}' 85 | worker.run(f'python {__file__} --role=receiver {ip_config}', 86 | non_blocking=True) 87 | worker.switch_window(1) # run in new tmux window 88 | if not ncluster.running_locally(): 89 | worker.run('source activate tensorflow_p36') 90 | worker.run( 91 | f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}') 92 | print(worker.read('out')) 93 | 94 | 95 | def run_receiver(): 96 | server = _launch_server('receiver') 97 | time.sleep(365 * 24 * 3600) 98 | del server 99 | 100 | 101 | def run_sender(): 102 | param_size = 250 * 1000 * args.size_mb // args.shards # 1MB is 250k integers 103 | log = util.FileLogger('out') 104 | grads_array = [] 105 | with tf.device('/job:chief/task:0'): 106 | # grads = tf.fill([param_size], 1.) 
class FileLogger:
    """Helper class to log to a file, optionally mirroring to stdout.

    Usage:
        logger = FileLogger('somefile.txt')
        logger = FileLogger('somefile.txt', mirror=True)
        logger('somemessage')
        logger('somemessage: %s %.2f', 'value', 2.5)
    """

    def __init__(self, fn, mirror=True):
        """Open ``fn`` for writing (truncating it) and remember ``mirror``."""
        self.fn = fn
        self.mirror = mirror
        self.f = open(fn, 'w')
        print(f"Creating FileLogger on {os.path.abspath(fn)}")

    def __call__(self, s='', *args):
        """Write one line to the log.

        Accepts either a printf-style call ``('asdf %f', 5)`` or a list of
        values ``(val1, val2, val3, ...)`` which are joined with ``', '``.
        """
        # bytes messages are decoded up front: mixing bytes with the str
        # operations below ('%' in s, + '\n') would raise TypeError.
        if isinstance(s, bytes):
            s = s.decode('utf-8', errors='replace')

        if isinstance(s, str) and '%' in s:
            try:
                formatted_s = s % args
            except (TypeError, ValueError):
                # A message that merely contains a literal '%' (e.g. an
                # already formatted "100% done") must not crash the logger.
                # With explicit args a mismatch is still a caller error.
                if args:
                    raise
                formatted_s = s
        else:
            formatted_s = ', '.join(str(tok) for tok in (s,) + args)

        self.f.write(formatted_s + '\n')
        self.f.flush()
        if self.mirror:
            # use manual flushing because "|" makes output 4k buffered
            # instead of line-buffered
            sys.stdout.write(formatted_s + '\n')
            sys.stdout.flush()

    def __del__(self):
        # self.f is missing when open() failed inside __init__
        f = getattr(self, 'f', None)
        if f is not None:
            f.close()


def ossystem(cmd):
    """Like os.system, but returns output of command as string.

    stderr is merged into stdout.  The output is decoded as UTF-8 with
    undecodable bytes replaced, so non-ASCII output no longer raises
    UnicodeDecodeError (pure ASCII output is returned unchanged).

    NOTE: ``cmd`` is run through the shell; only pass trusted command strings.
    """
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)
    stdout, _ = p.communicate()
    return stdout.decode('utf-8', errors='replace')
def _create_jupyter_config(password):
    """Write a Jupyter config carrying the hash of ``password``.

    Copies the bundled ``gpubox_jupyter_notebook_config.py`` (next to this
    script) into ``/tmp`` and rewrites its ``c.NotebookApp.password`` line
    with the hash produced by ``notebook.auth.passwd``.

    Args:
        password: plain-text notebook password to hash.

    Returns:
        Path of the temporary config file.
    """
    import shutil
    from notebook.auth import passwd

    # Hash the argument that was passed in, not the global args.password.
    sha = passwd(password)
    local_config_fn = f'{module_path}/gpubox_jupyter_notebook_config.py'
    temp_config_fn = '/tmp/' + os.path.basename(local_config_fn)
    # shutil.copy instead of shelling out to `cp`: handles paths with spaces
    # and raises on failure instead of silently continuing.
    shutil.copy(local_config_fn, temp_config_fn)
    _replace_lines(temp_config_fn, 'c.NotebookApp.password',
                   f"c.NotebookApp.password = '{sha}'")
    return temp_config_fn


def _replace_lines(fn, startswith, new_line):
    """Replace lines starting with ``startswith`` in ``fn`` with ``new_line``.

    The file is rewritten in place; every output line is terminated by a
    single newline, so untouched lines keep their original spacing.
    """
    with open(fn) as f:
        # Drop each line's terminator here; it is re-added uniformly below.
        # (Keeping it and joining with '\n' would double every line break.)
        new_lines = [new_line if line.startswith(startswith)
                     else line.rstrip('\n')
                     for line in f]
    with open(fn, 'w') as f:
        f.writelines(line + '\n' for line in new_lines)
|\r\n", 18 | "|===============================+======================+======================|\r\n", 19 | "| 0 Tesla M60 On | 00000000:00:1E.0 Off | 0 |\r\n", 20 | "| N/A 43C P8 14W / 150W | 0MiB / 7618MiB | 0% Default |\r\n", 21 | "+-------------------------------+----------------------+----------------------+\r\n", 22 | " \r\n", 23 | "+-----------------------------------------------------------------------------+\r\n", 24 | "| Processes: GPU Memory |\r\n", 25 | "| GPU PID Type Process name Usage |\r\n", 26 | "|=============================================================================|\r\n", 27 | "| No running processes found |\r\n", 28 | "+-----------------------------------------------------------------------------+\r\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "!nvidia-smi" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [] 42 | } 43 | ], 44 | "metadata": { 45 | "kernelspec": { 46 | "display_name": "Python [default]", 47 | "language": "python", 48 | "name": "python3" 49 | }, 50 | "language_info": { 51 | "codemirror_mode": { 52 | "name": "ipython", 53 | "version": 3 54 | }, 55 | "file_extension": ".py", 56 | "mimetype": "text/x-python", 57 | "name": "python", 58 | "nbconvert_exporter": "python", 59 | "pygments_lexer": "ipython3", 60 | "version": "3.6.4" 61 | }, 62 | "toc": { 63 | "colors": { 64 | "hover_highlight": "#DAA520", 65 | "running_highlight": "#FF0000", 66 | "selected_highlight": "#FFD700" 67 | }, 68 | "moveMenuLeft": true, 69 | "nav_menu": { 70 | "height": "12px", 71 | "width": "252px" 72 | }, 73 | "navigate_menu": true, 74 | "number_sections": true, 75 | "sideBar": true, 76 | "threshold": 4, 77 | "toc_cell": false, 78 | "toc_section_display": "block", 79 | "toc_window_display": false 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 2 84 | } 85 | -------------------------------------------------------------------------------- 
/examples/launch_16_instances.py: -------------------------------------------------------------------------------- 1 | import ncluster 2 | import time 3 | 4 | def main(): 5 | ncluster.set_backend('aws') 6 | 7 | start_time = time.time() 8 | job = ncluster.make_job(num_tasks=16) 9 | print(f"waited for startup for {time.time()-start_time} seconds") 10 | 11 | start_time = time.time() 12 | job.run('sleep 10') 13 | print(f"waited for exec for {time.time()-start_time} seconds") 14 | 15 | if __name__ == '__main__': 16 | main() 17 | -------------------------------------------------------------------------------- /examples/ray_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Example of two process Ray program, worker sends values to parameter 4 | # server on a different machine 5 | # 6 | # Run locally: 7 | # ./ray_example.py 8 | # 9 | # Run on AWS: 10 | # ./ray_example.py --aws 11 | 12 | import argparse 13 | import os 14 | import time 15 | 16 | import numpy as np 17 | import ray 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--role", default='launcher', type=str, 21 | help="launcher/driver") 22 | parser.add_argument('--image', default='Deep Learning AMI (Ubuntu) Version 13.0') 23 | parser.add_argument("--size-mb", default=10, type=int, help='how much data to send at each iteration') 24 | parser.add_argument("--iters", default=10, type=int) 25 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 26 | parser.add_argument("--ip", default='', type=str, 27 | help="internal flag, used to point worker to head node") 28 | 29 | args = parser.parse_args() 30 | 31 | dim = args.size_mb * 250 * 1000 32 | 33 | 34 | @ray.remote(resources={"worker": 1}) 35 | class Worker(object): 36 | def __init__(self): 37 | self.gradients = np.ones(dim, dtype=np.float32) 38 | 39 | def compute_gradients(self): 40 | return self.gradients 41 | 42 | 43 | 
def run_driver():
    """Repeatedly add the worker's gradients on the parameter server.

    Connects to the Ray cluster at ``args.ip``, creates one ``Worker`` and
    one ``ParameterServer`` actor and runs ``args.iters`` timed rounds.  Each
    round prints the elapsed time and the resulting throughput in MB/second.
    """
    ray.init(redis_address=args.ip)

    grad_source = Worker.remote()
    param_server = ParameterServer.remote()

    for _ in range(args.iters):
        began = time.time()
        grad_ref = grad_source.compute_gradients.remote()
        reply_ref = param_server.assign_add.remote(grad_ref)
        # The leading counter in the printed line is the first element of the
        # parameters returned by the server, not the loop index.
        first_param = ray.get(reply_ref)[0]
        seconds = time.time() - began
        mb_per_second = args.size_mb / seconds
        print('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (first_param, args.iters, args.size_mb, seconds * 1000, mb_per_second))
main() 108 | -------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter # for notebook.auth.passwd 2 | tensorflow 3 | torch 4 | ray 5 | -------------------------------------------------------------------------------- /examples/simple_job.py: -------------------------------------------------------------------------------- 1 | import ncluster 2 | import time 3 | 4 | def main(): 5 | ncluster.set_backend('local') 6 | 7 | job = ncluster.make_job(num_tasks=2) 8 | 9 | start_time = time.time() 10 | job.run('sleep 1') 11 | print(f"waited for {time.time()-start_time} seconds") 12 | 13 | if __name__ == '__main__': 14 | main() 15 | -------------------------------------------------------------------------------- /examples/simple_task.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import ncluster 3 | 4 | # allocate default machine type and default image 5 | task = ncluster.make_task() 6 | output = task.run('ifconfig') 7 | print(f"Task ifconfig returned {output}") 8 | -------------------------------------------------------------------------------- /examples/simple_tf.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | import sys 3 | 4 | if not sys.argv[1:]: 5 | import ncluster 6 | task = ncluster.make_task(instance_type='t3.micro') 7 | task.upload(__file__) 8 | task.run('pip install tensorflow') 9 | task.run(f'python {__file__} worker') 10 | elif sys.argv[1] == 'worker': 11 | import tensorflow as tf 12 | import os 13 | sess = tf.Session() 14 | ones = tf.ones((1000,1000)) 15 | result = sess.run(tf.matmul(ones, ones)) 16 | print(f"matmul gave {result.sum()}") 17 | os.system('sudo shutdown -h -P 10') # shut down the instance in 10 mins 18 | 19 | 
-------------------------------------------------------------------------------- /examples/tf_adder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | """ 5 | TensorFlow distributed benchmark. Create sender/receiver tasks and add arrays from sender tasks to variable on receiver. 6 | 7 | To run locally: 8 | ./tf_adder.py 9 | tmux a -t 0 10 | 11 | Should see something like this 12 | ``` 13 | 089/100 added 100 MBs in 114.9 ms: 1114.36 MB/second 14 | 090/100 added 100 MBs in 113.4 ms: 1128.61 MB/second 15 | 091/100 added 100 MBs in 113.4 ms: 1128.60 MB/second 16 | ``` 17 | 18 | 19 | To run on AWS 20 | aws configure # or set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY/AWS_DEFAULT_REGION 21 | ./tf_adder.py --aws 22 | nconnect 0.tf_adder 23 | 24 | Should see something like this with t3.large instances 25 | ``` 26 | 089/100 added 100 MBs in 253.8 ms: 504.27 MB/second 27 | 090/100 added 100 MBs in 252.6 ms: 506.63 MB/second 28 | 091/100 added 100 MBs in 255.0 ms: 501.92 MB/second 29 | ``` 30 | 31 | """ 32 | 33 | import argparse 34 | import json 35 | import os 36 | import tensorflow as tf 37 | import time 38 | 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 41 | parser.add_argument("--iters", default=20, type=int, help="Maximum number of additions") 42 | parser.add_argument("--data-mb", default=100, type=int, help="size of vector in MBs") 43 | parser.add_argument('--image', 44 | default='Deep Learning AMI (Ubuntu) Version 15.0') 45 | 46 | # internal flags 47 | parser.add_argument('--role', default='launcher', type=str) 48 | parser.add_argument("--sender-ip", default='127.0.0.1') 49 | parser.add_argument("--receiver-ip", default='127.0.0.1') 50 | args = parser.parse_args() 51 | 52 | cluster_spec = {'chief': [args.sender_ip + ':32300'], 53 | 'receiver': [args.receiver_ip + ':32301']} 54 | 55 | 56 | def _launch_server(role): 57 | 
def run_launcher():
    """Create the two-task ``tf_adder`` job and start receiver and sender.

    Uploads this script to both tasks, activates the TensorFlow conda
    environment when running on the AWS backend, starts the receiver in the
    background and then runs the sender, forwarding the benchmark settings
    (``--iters`` and ``--data-mb``) chosen on the launcher's command line.
    """
    import ncluster
    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job('tf_adder', num_tasks=2, image_name=args.image)
    job.upload(__file__)
    # Run whatever file was uploaded rather than a hard-coded name, so the
    # script keeps working if it is renamed.  Assumes upload() places the
    # file under its basename in the remote working directory, as
    # tf_adder_tb.py relies on -- TODO confirm.
    this_file = os.path.basename(__file__)

    sender, receiver = job.tasks
    if ncluster.get_backend() == 'aws':
        # on AWS probably running in conda DLAMI, switch into TF-enabled env
        job.run('source activate tensorflow_p36')

    ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
    receiver.run(f'python {this_file} --role=receiver {ip_config}',
                 non_blocking=True)
    # --data-mb must be forwarded explicitly; otherwise the sender silently
    # falls back to its own default and the launcher's flag has no effect.
    sender.run(f'python {this_file} --role=sender {ip_config} '
               f'--iters={args.iters} --data-mb={args.data_mb}')
def main():
    """Dispatch to the entry point matching ``--role``."""
    # run local benchmark in launcher and launch service
    handlers = {
        "launcher": run_launcher,
        "sender": run_sender,
        "receiver": run_receiver,
    }
    handler = handlers.get(args.role)
    if handler is None:
        assert False, 'unknown role'
    else:
        handler()
# Command-line interface shared by the launcher, sender and receiver roles.
parser = argparse.ArgumentParser()
parser.add_argument('--role', default='launcher', type=str)
# type=int is required: without it a value supplied on the command line is a
# str, which breaks range(args.iters) and the vector-size arithmetic in the
# sender (the integer defaults only masked the problem).
parser.add_argument("--iters", default=20, type=int,
                    help="Maximum number of additions")
parser.add_argument("--data-mb", default=128, type=int,
                    help="size of vector in MBs")
parser.add_argument("--sender-ip", default='127.0.0.1')
parser.add_argument("--receiver-ip", default='127.0.0.1')
parser.add_argument("--logdir", help='logging directory')
parser.add_argument("--aws", action='store_true')
parser.add_argument('--image',
                    default='Deep Learning AMI (Amazon Linux) Version 13.0')
args = parser.parse_args()

# One-task-per-job TensorFlow cluster layout: sender is "chief" on port 32300,
# receiver listens on port 32301.
cluster_spec = {'chief': [args.sender_ip + ':32300'],
                'receiver': [args.receiver_ip + ':32301']}
def run_sender():
    """Run the benchmark loop on the sender ("chief") task.

    Builds a graph with a constant vector placed on /job:chief/task:0 and a
    variable placed on /job:receiver/task:0, starts the local 'chief' server,
    then executes the assign_add op ``args.iters`` times. Each iteration's
    wall-clock time is printed and written as a ``time_ms`` scalar summary to
    ``args.logdir``.
    """
    summary_writer = tf.summary.FileWriter(args.logdir)

    # NOTE(review): tf.fill(..., 1.) presumably yields 4-byte floats, which is
    # what makes 250k elements ~1MB -- confirm dtype.
    param_size = 250 * 1000 * args.data_mb  # 1MB is 250k integers
    with tf.device('/job:chief/task:0'):
        grads = tf.fill([param_size], 1.)

    with tf.device('/job:receiver/task:0'):
        params = tf.Variable(tf.ones([param_size]))
        # keep only the op so sess.run does not fetch the updated tensor
        add_op = params.assign_add(grads).op

    # the graph above is defined before the in-process server is started
    server = _launch_server('chief')
    sess = tf.Session(server.target)

    sess.run(tf.global_variables_initializer())

    for i in range(args.iters):
        start_time = time.time()
        sess.run(add_op)
        elapsed_time = time.time() - start_time
        rate = args.data_mb / elapsed_time  # MB per second
        print('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.data_mb, elapsed_time * 1000, rate))
        # record per-iteration latency, using the iteration index as step
        summary = tf.Summary()
        summary.value.add(tag='time_ms', simple_value=elapsed_time * 1000)
        summary_writer.add_summary(summary, i)

    summary_writer.close()
# Pick the default backend: $NCLUSTER_BACKEND when the variable is present in
# the environment, otherwise the local backend.
set_backend(os.environ.get('NCLUSTER_BACKEND', 'local'))

util.install_pdb_handler()  # CTRL+\ drops into pdb
def __init__(self, name, *, instance, install_script='', image_name='',
             **extra_kwargs):
    """
    Initializes Task on top of existing AWS instance. Blocks until instance is
    ready to execute shell commands.

    The constructor opens the SSH connection, (re)creates the tmux session,
    creates local and remote scratch directories, runs the install script
    unless a previous run left the ``is_initialized`` marker behind, and
    mounts EFS.

    Args:
      name: task name
      instance: ec2.Instance object (https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#instance)
      install_script: shell script run once per instance; a line writing the
        ``is_initialized`` marker is appended to it
      image_name: AWS image name, only used to guess the Linux flavour
      **extra_kwargs: unused kwargs (kept for compatibility with other backends)

    Raises:
      OSError: if the local scratch directory cannot be created.
      AssertionError: if the install script ran but did not leave the marker.
    """
    self._cmd_fn = None
    self._cmd = None
    self._status_fn = None  # location of output of last status
    self.last_status = -1

    self._can_run = False  # indicates that things needed for .run were created
    self.initialize_called = False

    self.name = name
    self.instance = instance
    self.install_script = install_script
    self.extra_kwargs = extra_kwargs

    self.public_ip = u.get_public_ip(instance)
    self.ip = u.get_ip(instance)
    self.sftp = None
    self._linux_type = 'ubuntu'

    # heuristic to tell if I'm using Amazon image name
    # default image has name like 'amzn2-ami-hvm-2.0.20180622.1-x86_64-gp2'
    lowered_image_name = image_name.lower()
    if 'amzn' in lowered_image_name or 'amazon' in lowered_image_name:
        self.log('Detected Amazon Linux image')
        self._linux_type = 'amazon'
    self.run_counter = 0

    # the same scratch path is used on the launching machine and on the task
    launch_id = util.random_id()
    self.local_scratch = f"{TMPDIR}/{name}-{launch_id}"
    self.remote_scratch = f"{TMPDIR}/{name}-{launch_id}"

    # os.makedirs instead of shelling out to `mkdir -p`: no shell parsing of
    # the task name, and a failure surfaces here rather than on first write
    os.makedirs(self.local_scratch, exist_ok=True)

    # marker file, relative to the remote login directory
    self._initialized_fn = 'is_initialized'

    # _current_directory tracks current directory on task machine
    # used for uploading without specifying absolute path on target machine
    if self._linux_type == 'ubuntu':
        # self._current_directory = '/home/ubuntu'
        self.ssh_username = 'ubuntu'  # default username on task machine
    elif self._linux_type == 'amazon':
        # self._current_directory = '/home/ec2-user'
        self.ssh_username = 'ec2-user'
    self.homedir = '/home/' + self.ssh_username

    self.ssh_client = u.ssh_to_task(self)
    self._setup_tmux()
    self._run_raw('mkdir -p ' + self.remote_scratch)

    self._can_run = True

    if self._is_initialized_fn_present():
        self.log("reusing previous initialized state")
    else:
        self.log("running install script")

        # bin/bash needed to make self-executable or use with UserData
        self.install_script = '#!/bin/bash\n' + self.install_script
        self.install_script += f'\necho ok > {self._initialized_fn}\n'
        self.file_write('install.sh', util.shell_add_echo(self.install_script))
        self.run('bash -e install.sh')  # fail on errors
        assert self._is_initialized_fn_present(), f"Install script didn't write to {self._initialized_fn}"

    self._mount_efs()
    self.connect_instructions = f"""
To connect to {self.name}
ssh -i {u.get_keypair_fn()} -o StrictHostKeyChecking=no {self.ssh_username}@{self.public_ip}
tmux a
""".strip()
    self.log("Initialize complete")
    self.log(self.connect_instructions)
def run(self, cmd, non_blocking=False, ignore_errors=False,
        max_wait_sec=365 * 24 * 3600,
        check_interval=0.2):
    """Runs cmd in the task's tmux session.

    Thin wrapper around _run_with_output_on_failure, which captures the
    command's output and logs it when the command fails. The older code path
    that used to follow was guarded by ``... or True`` conditions and could
    never execute, so it has been dropped.

    Args:
      cmd: single-line shell command (the delegate rejects embedded newlines)
      non_blocking: return right after sending the command to tmux
      ignore_errors: log instead of raising on non-zero exit status
      max_wait_sec: upper bound on how long to poll for completion
      check_interval: pause before re-reading an empty status file; now
        forwarded to the delegate (previously accepted but ignored)

    Returns:
      Whatever _run_with_output_on_failure returns for this command.
    """
    return self._run_with_output_on_failure(cmd, non_blocking, ignore_errors,
                                            max_wait_sec,
                                            check_interval=check_interval)
def _run_with_output_on_failure(self, cmd, non_blocking=False,
                                ignore_errors=False,
                                max_wait_sec=365 * 24 * 3600,
                                check_interval=0.2) -> str:
    """Experimental version of run propagates error messages to client. This command will be default "run" eventually

    Sends cmd to the task's current tmux window, tee-ing stdout/stderr into a
    per-command ``.out`` file and writing the exit status into a ``.status``
    file under the remote scratch directory, then polls for the status file.

    Args:
      cmd: single-line shell command; must not contain a newline or '&'
      non_blocking: return immediately after the keys are sent to tmux
      ignore_errors: log a warning instead of raising on non-zero status
      max_wait_sec: stop polling for the status file after this many seconds
      check_interval: sleep before re-reading a status file that was empty

    Returns:
      Contents of the command's output file; '' when cmd is a comment line.
      NOTE(review): the non_blocking path returns 0, which disagrees with the
      ``-> str`` annotation.

    Raises:
      AssertionError: called before initialization finished, multi-line cmd,
        or cmd contains '&'.
      RuntimeError: non-zero exit status and ignore_errors is False.
    """

    if not self._can_run:
        assert False, "Using .run before initialization finished"

    if '\n' in cmd:
        assert False, "Don't support multi-line for run2"

    cmd = cmd.strip()
    if cmd.startswith('#'):  # ignore empty/commented out lines
        return ''
    self.run_counter += 1
    self.log("tmux> %s", cmd)

    # per-command bookkeeping files, numbered by run_counter
    self._cmd = cmd
    self._cmd_fn = f'{self.remote_scratch}/{self.run_counter}.cmd'
    self._status_fn = f'{self.remote_scratch}/{self.run_counter}.status'
    self._out_fn = f'{self.remote_scratch}/{self.run_counter}.out'

    cmd = util.shell_strip_comment(cmd)
    assert '&' not in cmd, f"cmd {cmd} contains &, that breaks things"

    # modify command to dump shell success status into file
    self.file_write(self._cmd_fn, cmd + '\n')

    #    modified_cmd = f'{cmd} > {out_fn} 2>&1; echo $? > {status_fn}'
    # https://stackoverflow.com/a/692407/419116
    # $cmd > >(tee -a fn) 2> >(tee -a fn >&2)

    # tee keeps output visible in tmux while also appending it to the .out file
    modified_cmd = f'{cmd} > >(tee -a {self._out_fn}) 2> >(tee -a {self._out_fn} >&2); echo $? > {self._status_fn}'
    modified_cmd = shlex.quote(modified_cmd)

    start_time = time.time()
    tmux_window = self.tmux_session + ':' + str(self.tmux_window_id)
    tmux_cmd = f"tmux send-keys -t {tmux_window} {modified_cmd} Enter"
    self._run_raw(tmux_cmd, ignore_errors=ignore_errors)
    if non_blocking:
        return 0

    # poll for the status file in 60 second slices until it appears or
    # max_wait_sec has elapsed
    if not self.wait_for_file(self._status_fn, max_wait_sec=60):
        self.log(f"Retrying waiting for {self._status_fn}")
    elapsed_time = time.time() - start_time
    while not self.exists(self._status_fn) and elapsed_time < max_wait_sec:
        self.log(f"Still waiting for {cmd}")
        self.wait_for_file(self._status_fn, max_wait_sec=60)
        elapsed_time = time.time() - start_time
    # NOTE(review): if the loop ended because max_wait_sec elapsed, the status
    # file is still read here even though it was never seen to exist.
    contents = self.read(self._status_fn)

    # if empty wait a bit to allow for race condition
    if len(contents) == 0:
        time.sleep(check_interval)
        contents = self.read(self._status_fn)
    status = int(contents.strip())
    self.last_status = status

    if status != 0:
        extra_msg = '(ignoring error)' if ignore_errors else '(failing)'
        self.log(
            f"Start failing output {extra_msg}: \n{'*'*80}\n\n '{self.read(self._out_fn)}'")
        self.log(f"\n{'*'*80}\nEnd failing output")
        if not ignore_errors:
            raise RuntimeError(f"Command {cmd} returned status {status}")
        else:
            self.log(f"Warning: command {cmd} returned status {status}")

    return self.read(self._out_fn)
def upload(self, local_fn: str, remote_fn: str = '',
           dont_overwrite: bool = False) -> None:
    """Uploads file to remote instance. If location not specified, dumps it
    into default directory. If remote location has files or directories with the
    same name, behavior is undefined.

    Args:
      local_fn: local file or directory; may contain '*' wildcards. Names
        containing '#' are skipped.
      remote_fn: destination path, '~' is replaced with the remote home
        directory. Defaults to the basename of local_fn.
      dont_overwrite: skip the upload when remote_fn already exists.

    Raises:
      AssertionError: local_fn is missing, or the remote parent directory of
        remote_fn does not exist.
    """

    # support wildcard through glob
    # NOTE: remote_fn and dont_overwrite are not forwarded to wildcard
    # matches; each match is uploaded under its own basename.
    if '*' in local_fn:
        for local_subfn in glob.glob(local_fn):
            self.upload(local_subfn)
        return

    if '#' in local_fn:  # hashes also give problems from shell commands
        # f-prefix was missing, so the literal "{local_fn}" used to be logged
        self.log(f"skipping backup file {local_fn}")
        return

    # open the SFTP channel lazily and reuse it across uploads
    if not self.sftp:
        self.sftp = u.call_with_retries(self.ssh_client.open_sftp,
                                        'self.ssh_client.open_sftp')

    def maybe_fix_mode(local_fn_, remote_fn_):
        """Makes remote file execute for locally executable files"""
        mode = oct(os.stat(local_fn_)[stat.ST_MODE])[-3:]
        if '7' in mode:
            self.log(f"Making {remote_fn_} executable with mode {mode}")
            # use raw run, in case tmux is unavailable
            self._run_raw(f"chmod {mode} {remote_fn_}")

    # augmented SFTP client that can transfer directories, from
    # https://stackoverflow.com/a/19974994/419116
    def _put_dir(source, target):
        """ Uploads the contents of the source directory to the target path."""

        def _safe_mkdir(path, mode=0o777, ignore_existing=True):
            """ Augments mkdir by adding an option to not fail if the folder exists"""
            try:
                self.sftp.mkdir(path, mode)
            except IOError:
                if not ignore_existing:
                    raise

        assert os.path.isdir(source)
        _safe_mkdir(target)

        for item in os.listdir(source):
            if os.path.isfile(os.path.join(source, item)):
                self.sftp.put(os.path.join(source, item), os.path.join(target, item))
                maybe_fix_mode(os.path.join(source, item), os.path.join(target, item))
            else:
                _safe_mkdir(f'{target}/{item}')
                _put_dir(f'{source}/{item}', f'{target}/{item}')

    if not remote_fn:
        remote_fn = os.path.basename(local_fn)

    self.log('uploading ' + local_fn + ' to ' + remote_fn)
    remote_fn = remote_fn.replace('~', self.homedir)

    if '/' in remote_fn:
        remote_dir = os.path.dirname(remote_fn)
        assert self.exists(
            remote_dir), f"Remote dir {remote_dir} doesn't exist"
    if dont_overwrite and self.exists(remote_fn):
        self.log("Remote file %s exists, skipping" % (remote_fn,))
        return

    assert os.path.exists(local_fn), f"{local_fn} not found"
    if os.path.isdir(local_fn):
        _put_dir(local_fn, remote_fn)
    else:
        assert os.path.isfile(local_fn), "%s is not a file" % (local_fn,)
        # this crashes with IOError when upload failed
        # uploading a file onto an existing remote directory puts it inside
        if self.exists(remote_fn) and self.isdir(remote_fn):
            remote_fn = remote_fn + '/' + os.path.basename(local_fn)
        self.sftp.put(localpath=local_fn, remotepath=remote_fn)
        maybe_fix_mode(local_fn, remote_fn)
def write(self, remote_fn, contents):
    """Writes contents to remote_fn on the task machine.

    The text is first stored in a uniquely named file inside the local scratch
    directory and then transferred with upload().

    Args:
      remote_fn: destination path on the task machine
      contents: text to write
    """
    tmp_fn = self.local_scratch + '/' + str(util.now_micros())
    # close (and therefore flush) the file before uploading; the previous
    # open(...).write(...) relied on garbage collection to do so
    with open(tmp_fn, 'w') as f:
        f.write(contents)
    self.upload(tmp_fn, remote_fn)
def setup_logdir(self):
    """Create logdir for task/job/run

    Picks ``<LOGDIR_ROOT>/<run_name>``; when that directory already exists a
    numeric suffix (``.01``, ``.02``, ...) is appended until an unused name is
    found. The chosen directory is created on the task machine and registered
    in ncluster_globals.

    Returns:
      Path of the newly created logdir.
    """
    # todo: locking on logdir creation
    run_name = ncluster_globals.get_run_for_task(self)
    self.log("Creating logdir for run " + run_name)
    logdir_root = ncluster_globals.LOGDIR_ROOT
    assert logdir_root

    self.run(f'mkdir -p {logdir_root}')
    find_command = f'find {logdir_root} -maxdepth 1 -type d'

    stdout, stderr = self.run_with_output(find_command)
    # Compare against exact lines of the listing. A plain substring test on
    # the whole output would treat "runs/foo" as taken whenever "runs/foo2"
    # (or any other name with the same prefix) exists.
    existing_dirs = {line.strip() for line in stdout.splitlines()}
    logdir = f"{logdir_root}/{run_name}"

    counter = 0
    while logdir in existing_dirs:
        counter += 1
        new_logdir = f'{logdir_root}/{run_name}.{counter:02d}'
        self.log(f'Warning, logdir {logdir} exists, deduping to {new_logdir}')
        logdir = new_logdir
    self.run(f'mkdir -p {logdir}')

    ncluster_globals.set_logdir(run_name, logdir)
    return logdir
def run_with_output(self, *args, **kwargs):
    """Runs command on every job in the run and returns the collected output.

    Arguments are passed through unchanged to each job's run_with_output.

    Returns:
      List with one entry per job, in ``self.jobs`` order, holding whatever
      that job's run_with_output returned. (Previously the results were
      discarded and None was returned.)
    """
    return [job.run_with_output(*args, **kwargs) for job in self.jobs]
image_name: str = '', 647 | disk_size: int = 0, 648 | preemptible=None, 649 | logging_task: backend.Task = None, 650 | create_resources=True, 651 | spot=False 652 | ) -> Task: 653 | """ 654 | Create task on AWS. 655 | 656 | Automatically places it in singleton Run/singleton Job objects, see Run/Job/Task hierarchy for details 657 | https://docs.google.com/document/d/1Gg4T243cYrDUW1YDCikmqp7fzSQDU3rZxOkJr9ohhs8/edit#heading=h.j4td4oixogib 658 | 659 | 660 | Args: 661 | disk_size: default size of root disk, in GBs 662 | create_resources: whether this task will handle resource creation 663 | name: see ncluster.make_task 664 | run_name: see ncluster.make_task 665 | install_script: see ncluster.make_task 666 | instance_type: instance type to use, defaults to $NCLUSTER_INSTANCE or t3.micro if unset 667 | image_name: name of image, ie, "Deep Learning AMI (Ubuntu) Version 12.0", defaults to $NCLUSTER_IMAGE or amzn2-ami-hvm-2.0.20180622.1-x86_64-gp2 if unset 668 | preemptible: use cheaper preemptible/spot instances 669 | logging_task: partially initialized Task object, use it for logging 670 | 671 | Returns: 672 | 673 | """ 674 | 675 | ncluster_globals.task_launched = True 676 | 677 | def log(*_args): 678 | if logging_task: 679 | logging_task.log(*_args) 680 | else: 681 | util.log(*_args) 682 | 683 | # if name not specified, use name which is the same across script invocations for given image/instance-type 684 | name = ncluster_globals.auto_assign_task_name_if_needed(name, instance_type, 685 | image_name) 686 | 687 | if not instance_type: 688 | instance_type = os.environ.get('NCLUSTER_INSTANCE', 't3.micro') 689 | log("Using instance " + instance_type) 690 | 691 | _set_aws_environment() 692 | if create_resources: 693 | _maybe_create_resources(logging_task=logging_task) 694 | else: 695 | pass 696 | 697 | run: Run = ncluster_globals.get_run_object(run_name) 698 | placement_group = '' 699 | if u.instance_supports_placement_groups(instance_type) and run: 700 | placement_group = 
run.placement_group 701 | log(f"Launching into placement_group group {placement_group}") 702 | u.maybe_create_placement_group(run.placement_group) 703 | 704 | if not image_name: 705 | image_name = os.environ.get('NCLUSTER_IMAGE', 706 | GENERIC_SMALL_IMAGE) 707 | log("Using image " + image_name) 708 | 709 | if preemptible is None: 710 | preemptible = os.environ.get('NCLUSTER_PREEMPTIBLE', False) 711 | preemptible = bool(preemptible) 712 | if preemptible: 713 | log("Using preemptible instances") 714 | 715 | image = u.lookup_image(image_name) 716 | keypair = u.get_keypair() 717 | security_group = u.get_security_group() 718 | ec2 = u.get_ec2_resource() 719 | 720 | instance = u.lookup_instance(name, instance_type, 721 | image_name) 722 | _maybe_start_instance(instance) 723 | _maybe_wait_for_initializing_instance(instance) 724 | 725 | # create the instance if not present 726 | if instance: 727 | log(f"Reusing {instance}") 728 | else: 729 | log(f"Allocating {instance_type} for task {name}") 730 | args = {'ImageId': image.id, 731 | 'InstanceType': instance_type, 732 | 'MinCount': 1, 733 | 'MaxCount': 1, 734 | 'SecurityGroupIds': [security_group.id], 735 | 'KeyName': keypair.name} 736 | 737 | args['TagSpecifications'] = [{ 738 | 'ResourceType': 'instance', 739 | 'Tags': [{ 740 | 'Key': 'Name', 741 | 'Value': name 742 | }] 743 | }] 744 | 745 | # subnet = u.get_subnet() 746 | # args['NetworkInterfaces'] = [{'SubnetId': subnet.id, 747 | # 'DeviceIndex': 0, 748 | # 'AssociatePublicIpAddress': True, 749 | # 'Groups': [security_group.id]}] 750 | # placement_specs = {'AvailabilityZone': u.get_zone()} 751 | 752 | placement_specs = {} 753 | if placement_group: 754 | placement_specs['GroupName'] = placement_group 755 | 756 | args['Placement'] = placement_specs 757 | args['Monitoring'] = {'Enabled': True} 758 | 759 | if disk_size: 760 | assert disk_size > 0 761 | ebs = { 762 | 'VolumeSize': disk_size, 763 | 'VolumeType': 'gp2', 764 | } 765 | 766 | args['BlockDeviceMappings'] = [{ 767 
| 'DeviceName': '/dev/sda1', 768 | 'Ebs': ebs 769 | }] 770 | 771 | # Use high throughput disk (0.065/iops-month = about $1/hour) 772 | if 'NCLUSTER_AWS_FAST_ROOTDISK' in os.environ: 773 | assert not disk_size, f"Specified both disk_size {disk_size} and $NCLUSTER_AWS_FAST_ROOTDISK, they are incompatible as $NCLUSTER_AWS_FAST_ROOTDISK hardwired disk size" 774 | 775 | ebs = { 776 | 'VolumeSize': 500, 777 | 'VolumeType': 'io1', 778 | 'Iops': 11500 779 | } 780 | 781 | args['BlockDeviceMappings'] = [{ 782 | 'DeviceName': '/dev/sda1', 783 | 'Ebs': ebs 784 | }] 785 | 786 | instances = [] 787 | try: 788 | if spot: 789 | instances = u.create_spot_instances(args) 790 | else: 791 | instances = ec2.create_instances(**args) 792 | except Exception as e: 793 | log(f"Instance creation for {name} failed with ({e})") 794 | log( 795 | "You can change availability zone using export NCLUSTER_ZONE=...") 796 | log("Terminating") 797 | os.kill(os.getpid(), 798 | signal.SIGINT) # sys.exit() doesn't work inside thread 799 | 800 | assert instances, f"ec2.create_instances returned {instances}" 801 | log(f"Allocated {len(instances)} instances") 802 | instance = instances[0] 803 | 804 | task = Task(name, instance=instance, 805 | install_script=install_script, 806 | image_name=image_name, 807 | instance_type=instance_type) 808 | 809 | ncluster_globals.register_task(task, run_name) 810 | return task 811 | 812 | 813 | def make_job( 814 | name: str = '', 815 | run_name: str = '', 816 | num_tasks: int = 1, 817 | install_script: str = '', 818 | instance_type: str = '', 819 | image_name: str = '', 820 | create_resources=True, 821 | **kwargs) -> Job: 822 | """ 823 | Args: 824 | create_resources: if True, will create resources if necessary 825 | name: see backend.make_task 826 | run_name: see backend.make_task 827 | num_tasks: number of tasks to launch 828 | install_script: see make_task 829 | instance_type: see make_task 830 | image_name: see make_task 831 | 832 | Returns: 833 | 834 | """ 835 | assert 
num_tasks > 0, f"Can't create job with {num_tasks} tasks" 836 | assert name.count( 837 | '.') <= 1, "Job name has too many .'s (see ncluster design: Run/Job/Task hierarchy for convention)" 838 | 839 | # dummy tasks for logging 840 | tasks = [backend.Task(f"{i}.{name}") for i in range(num_tasks)] 841 | 842 | _set_aws_environment(tasks[0]) 843 | if create_resources: 844 | _maybe_create_resources(tasks[0]) 845 | 846 | name = ncluster_globals.auto_assign_job_name_if_needed(name) 847 | run_name = ncluster_globals.auto_assign_run_name_if_needed(run_name) 848 | _run = ncluster_globals.create_run_if_needed(run_name, make_run) 849 | 850 | job = Job(name=name, tasks=tasks, run_name=run_name, **kwargs) 851 | 852 | exceptions = [] 853 | 854 | # make tasks in parallel 855 | def make_task_fn(i: int): 856 | try: 857 | tasks[i] = make_task(f"{i}.{name}", run_name=run_name, 858 | install_script=install_script, 859 | instance_type=instance_type, image_name=image_name, 860 | logging_task=tasks[i], 861 | create_resources=False, 862 | # handle resources in job already 863 | **kwargs) 864 | except Exception as e: 865 | exceptions.append(e) 866 | 867 | util.log("Creating threads") 868 | threads = [threading.Thread(name=f'make_task_{i}', 869 | target=make_task_fn, args=[i]) 870 | for i in range(num_tasks)] 871 | for thread in threads: 872 | thread.start() 873 | for thread in threads: 874 | thread.join() 875 | print("Exception are ", exceptions) 876 | if exceptions: 877 | raise exceptions[0] 878 | 879 | job.tasks = tasks 880 | 881 | # double check that all instances are in the same placement_group group 882 | # this can happen if some instances from previous smaller run are getting reused 883 | placement_dict = {task.instance.placement_group: task.name for task in 884 | job.tasks} 885 | # TODO: make placement_group group name derived from run, to make it deterministic 886 | # on individual instance restarts 887 | if len(placement_dict) > 1: 888 | util.log("Job tasks are spread over 
multiple placement_group groups") 889 | pprint.pprint(placement_dict) 890 | raise RuntimeError( 891 | f"Got instance spread over multiple placement_group groups: {placement_dict}. Must terminate all instances in run {run_name} and try again.") 892 | return job 893 | 894 | 895 | def make_run(name) -> Run: 896 | run = Run(name) 897 | ncluster_globals.register_run(name, run) 898 | return run 899 | 900 | 901 | # TODO: this method and a few others are backend specific, document in API doc 902 | def _maybe_start_instance(instance): 903 | """Starts instance if it's stopped, no-op otherwise.""" 904 | 905 | if not instance: 906 | return 907 | 908 | if instance.state['Name'] == 'stopped': 909 | instance.start() 910 | while True: 911 | print(f"Waiting for {instance} to start.") 912 | instance.reload() 913 | if instance.state['Name'] == 'running': 914 | break 915 | time.sleep(10) 916 | 917 | 918 | def _maybe_wait_for_initializing_instance(instance): 919 | """Starts instance if it's stopped, no-op otherwise.""" 920 | 921 | if not instance: 922 | return 923 | 924 | if instance.state['Name'] == 'initializing': 925 | while True: 926 | print(f"Waiting for {instance} to leave state 'initializing'.") 927 | instance.reload() 928 | if instance.state['Name'] == 'running': 929 | break 930 | time.sleep(10) 931 | 932 | 933 | def _maybe_create_resources(logging_task: Task = None): 934 | """Use heuristics to decide to possibly create resources""" 935 | 936 | def log(*args): 937 | if logging_task: 938 | logging_task.log(*args) 939 | else: 940 | util.log(*args) 941 | 942 | def should_create_resources(): 943 | """Check if gateway, keypair, vpc exist.""" 944 | prefix = u.get_prefix() 945 | if u.get_keypair_name() not in u.get_keypair_dict(): 946 | log(f"Missing {u.get_keypair_name()} keypair, creating resources") 947 | return True 948 | vpcs = u.get_vpc_dict() 949 | if prefix not in vpcs: 950 | log(f"Missing {prefix} vpc, creating resources") 951 | return True 952 | vpc = vpcs[prefix] 953 | 
gateways = u.get_gateway_dict(vpc) 954 | if prefix not in gateways: 955 | log(f"Missing {prefix} gateway, creating resources") 956 | return True 957 | return False 958 | 959 | try: 960 | # this locking is approximate, still possible for threads to slip through 961 | if os.path.exists(AWS_LOCK_FN): 962 | pid, ts, lock_taskname = open(AWS_LOCK_FN).read().split('-') 963 | ts = int(ts) 964 | log(f"waiting for aws resource creation, another resource initiation was " 965 | f"initiated {int(time.time()-ts)} seconds ago by " 966 | f"{lock_taskname}, delete lock file " 967 | f"{AWS_LOCK_FN} if this is an error") 968 | while True: 969 | if os.path.exists(AWS_LOCK_FN): 970 | log(f"waiting for lock file {AWS_LOCK_FN} to get deleted " 971 | f"initiated {int(time.time()-ts)} seconds ago by ") 972 | time.sleep(2) 973 | continue 974 | else: 975 | break 976 | return 977 | 978 | with open(AWS_LOCK_FN, 'w') as f: 979 | f.write( 980 | f'{os.getpid()}-{int(time.time())}-{logging_task.name if logging_task else ""}') 981 | 982 | if not should_create_resources(): 983 | util.log("Resources already created, no-op") 984 | os.remove(AWS_LOCK_FN) 985 | return 986 | 987 | create_lib.create_resources() 988 | finally: 989 | if os.path.exists(AWS_LOCK_FN): 990 | os.remove(AWS_LOCK_FN) 991 | 992 | 993 | def _set_aws_environment(task: Task = None): 994 | """Sets up AWS environment from NCLUSTER environment variables""" 995 | current_zone = os.environ.get('NCLUSTER_ZONE', '') 996 | current_region = os.environ.get('AWS_DEFAULT_REGION', '') 997 | 998 | def log(*args): 999 | if task: 1000 | task.log(*args) 1001 | else: 1002 | util.log(*args) 1003 | 1004 | if current_region and current_zone: 1005 | assert current_zone.startswith( 1006 | current_region), f'Current zone "{current_zone}" ($NCLUSTER_ZONE) is not ' \ 1007 | f'in current region "{current_region} ($AWS_DEFAULT_REGION)' 1008 | assert u.get_session().region_name == current_region # setting from ~/.aws 1009 | 1010 | # zone is set, set region from 
zone 1011 | if current_zone and not current_region: 1012 | current_region = current_zone[:-1] 1013 | os.environ['AWS_DEFAULT_REGION'] = current_region 1014 | 1015 | # neither zone nor region not set, use default setting for region 1016 | # if default is not set, use NCLUSTER_DEFAULT_REGION 1017 | if not current_region: 1018 | current_region = u.get_session().region_name 1019 | if not current_region: 1020 | log(f"No default region available, using {NCLUSTER_DEFAULT_REGION}") 1021 | current_region = NCLUSTER_DEFAULT_REGION 1022 | os.environ['AWS_DEFAULT_REGION'] = current_region 1023 | 1024 | # zone not set, use first zone of the region 1025 | # if not current_zone: 1026 | # current_zone = current_region + 'a' 1027 | # os.environ['NCLUSTER_ZONE'] = current_zone 1028 | 1029 | log(f"Using account {u.get_account_number()}, region {current_region}, " 1030 | f"zone {current_zone}") 1031 | 1032 | -------------------------------------------------------------------------------- /ncluster/aws_create_resources.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Creates resources 4 | # This script creates VPC/security group/keypair if not already present 5 | 6 | import os 7 | import sys 8 | import time 9 | 10 | from ncluster import aws_util as u 11 | from ncluster import util 12 | 13 | DRYRUN = False 14 | DEBUG = True 15 | 16 | # Names of Amazon resources that are created. These settings are fixed across 17 | # all runs, and correspond to resources created once per user per region. 
18 | 19 | PUBLIC_TCP_RANGES = [ 20 | 22, # ssh 21 | # ipython notebook ports 22 | (8888, 8899), 23 | # redis port 24 | 6379, 25 | # tensorboard ports 26 | (6006, 6016) 27 | ] 28 | 29 | PUBLIC_UDP_RANGES = [(60000, 61000)] # mosh ports 30 | 31 | 32 | # TODO: this creates a custom VPC, but we are using default VPC, so have two security groups 33 | # once we are sure we don't need custom VPC, can get rid of extra VPC creation 34 | 35 | def network_setup(): 36 | """Creates VPC if it doesn't already exists, configures it for public 37 | internet access, returns vpc, subnet, security_group""" 38 | 39 | # from https://gist.github.com/nguyendv/8cfd92fc8ed32ebb78e366f44c2daea6 40 | 41 | ec2 = u.get_ec2_resource() 42 | client = u.get_ec2_client() 43 | existing_vpcs = u.get_vpc_dict() 44 | zones = u.get_zones() 45 | 46 | # create VPC from scratch. Remove this if default VPC works well enough. 47 | vpc_name = u.get_vpc_name() 48 | if u.get_vpc_name() in existing_vpcs: 49 | print("Reusing VPC " + vpc_name) 50 | vpc = existing_vpcs[vpc_name] 51 | subnets = list(vpc.subnets.all()) 52 | assert len(subnets) == len( 53 | zones), "Has %s subnets, but %s zones, something went wrong during resource creation, try delete_resources.py/create_resources.py" % ( 54 | len(subnets), len(zones)) 55 | 56 | else: 57 | print("Creating VPC " + vpc_name) 58 | vpc = ec2.create_vpc(CidrBlock='192.168.0.0/16') 59 | 60 | # enable DNS on the VPC 61 | response = vpc.modify_attribute(EnableDnsHostnames={"Value": True}) 62 | assert u.is_good_response(response) 63 | response = vpc.modify_attribute(EnableDnsSupport={"Value": True}) 64 | assert u.is_good_response(response) 65 | 66 | vpc.create_tags(Tags=u.create_name_tags(vpc_name)) 67 | vpc.wait_until_available() 68 | 69 | gateways = u.get_gateway_dict(vpc) 70 | gateway_name = u.get_gateway_name() 71 | if gateway_name in gateways: 72 | print("Reusing gateways " + gateway_name) 73 | else: 74 | print("Creating internet gateway " + gateway_name) 75 | ig = 
ec2.create_internet_gateway() 76 | ig.attach_to_vpc(VpcId=vpc.id) 77 | ig.create_tags(Tags=u.create_name_tags(gateway_name)) 78 | 79 | # check that attachment succeeded 80 | attach_state = u.extract_attr_for_match(ig.attachments, State=-1, 81 | VpcId=vpc.id) 82 | assert attach_state == 'available', "vpc %s is in state %s" % (vpc.id, 83 | attach_state) 84 | 85 | route_table = vpc.create_route_table() 86 | route_table_name = u.get_route_table_name() 87 | route_table.create_tags(Tags=u.create_name_tags(route_table_name)) 88 | 89 | dest_cidr = '0.0.0.0/0' 90 | route_table.create_route( 91 | DestinationCidrBlock=dest_cidr, 92 | GatewayId=ig.id 93 | ) 94 | # check success 95 | for route in route_table.routes: 96 | # result looks like this 97 | # ec2.Route(route_table_id='rtb-a8b438cf', 98 | # destination_cidr_block='0.0.0.0/0') 99 | if route.destination_cidr_block == dest_cidr: 100 | break 101 | else: 102 | # sometimes get 103 | # AssertionError: Route for 0.0.0.0/0 not found in [ec2.Route(route_table_id='rtb-cd9153b0', destination_cidr_block='192.168.0.0/16')] 104 | # TODO: add a wait/retry? 105 | assert False, "Route for %s not found in %s" % (dest_cidr, 106 | route_table.routes) 107 | 108 | assert len(zones) <= 16 # for cidr/20 to fit into cidr/16 109 | ip = 0 110 | for zone in zones: 111 | cidr_block = '192.168.%d.0/20' % (ip,) 112 | ip += 16 113 | print("Creating subnet %s in zone %s" % (cidr_block, zone)) 114 | subnet = vpc.create_subnet(CidrBlock=cidr_block, 115 | AvailabilityZone=zone) 116 | subnet.create_tags(Tags=[{'Key': 'Name', 'Value': f'{vpc_name}-subnet'}, 117 | {'Key': 'Region', 'Value': zone}]) 118 | response = client.modify_subnet_attribute( 119 | MapPublicIpOnLaunch={'Value': True}, 120 | SubnetId=subnet.id 121 | ) 122 | assert u.is_good_response(response) 123 | u.wait_until_available(subnet) 124 | assert subnet.map_public_ip_on_launch, "Subnet doesn't enable public IP by default, why?" 
125 | 126 | route_table.associate_with_subnet(SubnetId=subnet.id) 127 | 128 | # Use default VPC from now on 129 | vpc = u.get_default_vpc() 130 | if not vpc: 131 | util.log(f"Creating default VPC for region {u.get_region()}") 132 | client.create_default_vpc() 133 | vpc = u.get_default_vpc() 134 | assert vpc, "Could not create default VPC?" 135 | 136 | existing_security_groups = u.get_security_group_dict() 137 | security_group_name = u.get_security_group_name() 138 | if security_group_name in existing_security_groups: 139 | print("Reusing security group " + security_group_name) 140 | security_group = existing_security_groups[security_group_name] 141 | assert security_group.vpc_id == vpc.id, f"Found security group {security_group} " \ 142 | f"attached to {security_group.vpc_id} but expected {vpc.id}" 143 | else: 144 | print("Creating security group " + security_group_name) 145 | security_group = ec2.create_security_group( 146 | GroupName=security_group_name, Description=security_group_name, 147 | VpcId=vpc.id) 148 | 149 | security_group.create_tags(Tags=u.create_name_tags(security_group_name)) 150 | 151 | # allow ICMP access for public ping 152 | security_group.authorize_ingress( 153 | CidrIp='0.0.0.0/0', 154 | IpProtocol='icmp', 155 | FromPort=-1, 156 | ToPort=-1 157 | ) 158 | 159 | # open public ports 160 | # always include SSH port which is required for basic functionality 161 | assert 22 in PUBLIC_TCP_RANGES, "Must enable SSH access" 162 | for port in PUBLIC_TCP_RANGES: 163 | if util.is_iterable(port): 164 | assert len(port) == 2 165 | from_port, to_port = port 166 | else: 167 | from_port, to_port = port, port 168 | 169 | response = security_group.authorize_ingress(IpProtocol="tcp", 170 | CidrIp="0.0.0.0/0", 171 | FromPort=from_port, 172 | ToPort=to_port) 173 | assert u.is_good_response(response) 174 | 175 | for port in PUBLIC_UDP_RANGES: 176 | if util.is_iterable(port): 177 | assert len(port) == 2 178 | from_port, to_port = port 179 | else: 180 | from_port, 
to_port = port, port 181 | 182 | response = security_group.authorize_ingress(IpProtocol="udp", 183 | CidrIp="0.0.0.0/0", 184 | FromPort=from_port, 185 | ToPort=to_port) 186 | assert u.is_good_response(response) 187 | 188 | # allow ingress within security group 189 | # Authorizing ingress doesn't work with names in a non-default VPC, 190 | # so must use more complicated syntax 191 | # https://github.com/boto/boto3/issues/158 192 | response = {} 193 | for protocol in ['icmp']: 194 | try: 195 | rule = {'FromPort': -1, 196 | 'IpProtocol': protocol, 197 | 'IpRanges': [], 198 | 'PrefixListIds': [], 199 | 'ToPort': -1, 200 | 'UserIdGroupPairs': [{'GroupId': security_group.id}]} 201 | response = security_group.authorize_ingress(IpPermissions=[rule]) 202 | except Exception as e: 203 | if response['Error']['Code'] == 'InvalidPermission.Duplicate': 204 | print("Warning, got " + str(e)) 205 | else: 206 | assert False, "Failed while authorizing ingress with " + str(e) 207 | 208 | for protocol in ['tcp', 'udp']: 209 | try: 210 | rule = {'FromPort': 0, 211 | 'IpProtocol': protocol, 212 | 'IpRanges': [], 213 | 'PrefixListIds': [], 214 | 'ToPort': 65535, 215 | 'UserIdGroupPairs': [{'GroupId': security_group.id}]} 216 | response = security_group.authorize_ingress(IpPermissions=[rule]) 217 | except Exception as e: 218 | if response['Error']['Code'] == 'InvalidPermission.Duplicate': 219 | print("Warning, got " + str(e)) 220 | else: 221 | assert False, "Failed while authorizing ingress with " + str(e) 222 | 223 | return vpc, security_group 224 | 225 | 226 | def keypair_setup(): 227 | """Creates keypair if necessary, saves private key locally, returns contents 228 | of private key file.""" 229 | 230 | os.system('mkdir -p ' + u.PRIVATE_KEY_LOCATION) 231 | 232 | keypair_name = u.get_keypair_name() 233 | keypair = u.get_keypair_dict().get(keypair_name, None) 234 | keypair_fn = u.get_keypair_fn() 235 | if keypair: 236 | print("Reusing keypair " + keypair_name) 237 | # check that local pem 
file exists and is readable 238 | assert os.path.exists( 239 | keypair_fn), "Keypair %s exists, but corresponding .pem file %s is not found, delete keypair %s through console and run again to recreate keypair/.pem together" % ( 240 | keypair_name, keypair_fn, keypair_name) 241 | keypair_contents = open(keypair_fn).read() 242 | assert len(keypair_contents) > 0 243 | else: 244 | print("Creating keypair " + keypair_name) 245 | ec2 = u.get_ec2_resource() 246 | assert not os.path.exists( 247 | keypair_fn), "previous keypair exists, delete it with 'sudo rm %s' and also delete corresponding keypair through console" % ( 248 | keypair_fn) 249 | keypair = ec2.create_key_pair(KeyName=keypair_name) 250 | 251 | open(keypair_fn, 'w').write(keypair.key_material) 252 | os.system('chmod 400 ' + keypair_fn) 253 | 254 | return keypair 255 | 256 | 257 | def placement_group_setup(group_name): 258 | """Creates placement_group group if necessary. Returns True if new placement_group 259 | group was created, False otherwise.""" 260 | 261 | existing_placement_groups = u.get_placement_group_dict() 262 | 263 | group = existing_placement_groups.get(group_name, None) 264 | if group: 265 | assert group.state == 'available' 266 | assert group.strategy == 'cluster' 267 | print("Reusing group ", group.name) 268 | return group 269 | 270 | print("Creating group " + group_name) 271 | ec2 = u.get_ec2_resource() 272 | group = ec2.create_placement_group(GroupName=group_name, Strategy='cluster') 273 | return group 274 | 275 | 276 | def create_resources(): 277 | print(f"Creating {u.get_prefix()} resources in region {u.get_region()}") 278 | 279 | vpc, security_group = network_setup() 280 | keypair_setup() # saves private key locally to keypair_fn 281 | 282 | # create EFS 283 | efss = u.get_efs_dict() 284 | efs_name = u.get_efs_name() 285 | efs_id = efss.get(efs_name, '') 286 | if not efs_id: 287 | print("Creating EFS " + efs_name) 288 | efs_id = u.create_efs(efs_name) 289 | else: 290 | print("Reusing EFS " 
+ efs_name) 291 | 292 | efs_client = u.get_efs_client() 293 | 294 | # create mount target for each subnet in the VPC 295 | 296 | # added retries because efs is not immediately available 297 | max_failures = 10 298 | retry_interval_sec = 1 299 | for subnet in vpc.subnets.all(): 300 | for retry_attempt in range(max_failures): 301 | try: 302 | sys.stdout.write( 303 | "Creating efs mount target for %s ... " % (subnet.availability_zone,)) 304 | sys.stdout.flush() 305 | response = efs_client.create_mount_target(FileSystemId=efs_id, 306 | SubnetId=subnet.id, 307 | SecurityGroups=[ 308 | security_group.id]) 309 | if u.is_good_response(response): 310 | print("success") 311 | break 312 | except Exception as e: 313 | if 'already exists' in str(e): # ignore "already exists" errors 314 | print('already exists') 315 | break 316 | 317 | # Takes couple of seconds for EFS to come online, with 318 | # errors like this: 319 | # Creating efs mount target for us-east-1f ... Failed with An error occurred (IncorrectFileSystemLifeCycleState) when calling the CreateMountTarget operation: None, retrying in 1 sec 320 | 321 | print("Got %s, retrying in %s sec" % (str(e), retry_interval_sec)) 322 | time.sleep(retry_interval_sec) 323 | else: 324 | print("Giving up.") 325 | 326 | 327 | if __name__ == '__main__': 328 | create_resources() 329 | -------------------------------------------------------------------------------- /ncluster/aws_delete_resources.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Deletes resources 4 | 5 | import sys 6 | import os 7 | import argparse 8 | 9 | from ncluster import aws_util as u 10 | from ncluster import util 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--kind', type=str, default='all', 14 | help="which resources to delete, all/network/keypair/efs") 15 | parser.add_argument('--force-delete-efs', action='store_true', 16 | help="force deleting main EFS") 17 | args = 
parser.parse_args() 18 | 19 | EFS_NAME = u.get_prefix() 20 | VPC_NAME = u.get_prefix() 21 | SECURITY_GROUP_NAME = u.get_prefix() 22 | ROUTE_TABLE_NAME = u.get_prefix() 23 | KEYPAIR_NAME = u.get_keypair_name() 24 | 25 | client = u.get_ec2_client() 26 | ec2 = u.get_ec2_resource() 27 | 28 | 29 | def response_type(response): 30 | return 'ok' if u.is_good_response(response) else 'failed' 31 | 32 | 33 | def delete_efs(): 34 | efss = u.get_efs_dict() 35 | efs_id = efss.get(EFS_NAME, '') 36 | efs_client = u.get_efs_client() 37 | if efs_id: 38 | try: 39 | # delete mount targets first 40 | print("About to delete %s (%s)" % (efs_id, EFS_NAME)) 41 | response = efs_client.describe_mount_targets(FileSystemId=efs_id) 42 | assert u.is_good_response(response) 43 | for mount_response in response['MountTargets']: 44 | id_ = mount_response['MountTargetId'] 45 | sys.stdout.write('Deleting mount target %s ... ' % (id_,)) 46 | sys.stdout.flush() 47 | response = efs_client.delete_mount_target(MountTargetId=id_) 48 | print(response_type(response)) 49 | 50 | sys.stdout.write('Deleting EFS %s (%s)... ' % (efs_id, EFS_NAME)) 51 | sys.stdout.flush() 52 | u.delete_efs_by_id(efs_id) 53 | 54 | except Exception as e: 55 | sys.stdout.write(f'failed with {e}\n') 56 | util.log_error(str(e) + '\n') 57 | 58 | 59 | def delete_network(): 60 | existing_vpcs = u.get_vpc_dict() 61 | if VPC_NAME in existing_vpcs: 62 | vpc = ec2.Vpc(existing_vpcs[VPC_NAME].id) 63 | print("Deleting VPC %s (%s) subresources:" % (VPC_NAME, vpc.id)) 64 | 65 | for subnet in vpc.subnets.all(): 66 | try: 67 | sys.stdout.write("Deleting subnet %s ... " % subnet.id) 68 | sys.stdout.write(response_type(subnet.delete()) + '\n') 69 | except Exception as e: 70 | sys.stdout.write('failed\n') 71 | util.log_error(str(e) + '\n') 72 | 73 | for gateway in vpc.internet_gateways.all(): 74 | sys.stdout.write("Deleting gateway %s ... 
" % gateway.id) 75 | # note: if instances are using VPC, this fails with 76 | # botocore.exceptions.ClientError: An error occurred (DependencyViolation) when calling the DetachInternetGateway operation: Network vpc-ca4abab3 has some mapped public address(es). Please unmap those public address(es) before detaching the gateway. 77 | 78 | sys.stdout.write('detached ... ' if u.is_good_response( 79 | gateway.detach_from_vpc(VpcId=vpc.id)) else ' detach_failed ') 80 | sys.stdout.write('deleted ' if u.is_good_response( 81 | gateway.delete()) else ' delete_failed ') 82 | sys.stdout.write('\n') 83 | 84 | def desc(): 85 | return "%s (%s)" % (route_table.id, u.get_name(route_table.tags)) 86 | 87 | for route_table in vpc.route_tables.all(): 88 | sys.stdout.write(f"Deleting route table {desc()} ... ") 89 | try: 90 | sys.stdout.write(response_type(route_table.delete()) + '\n') 91 | except Exception as e: 92 | sys.stdout.write('failed\n') 93 | util.log_error(str(e) + '\n') 94 | 95 | def desc(): 96 | return "%s (%s, %s)" % ( 97 | security_group.id, u.get_name(security_group.tags), 98 | security_group.group_name) 99 | 100 | for security_group in vpc.security_groups.all(): 101 | # default group is undeletable, skip 102 | if security_group.group_name == 'default': 103 | continue 104 | sys.stdout.write( 105 | 'Deleting security group %s ... ' % (desc())) 106 | try: 107 | sys.stdout.write(response_type(security_group.delete()) + '\n') 108 | except Exception as e: 109 | sys.stdout.write('failed\n') 110 | util.log_error(str(e) + '\n') 111 | 112 | sys.stdout.write("Deleting VPC %s ... " % vpc.id) 113 | try: 114 | sys.stdout.write(response_type(vpc.delete()) + '\n') 115 | except Exception as e: 116 | sys.stdout.write('failed\n') 117 | util.log_error(str(e) + '\n') 118 | 119 | 120 | def delete_keypair(): 121 | keypairs = u.get_keypair_dict() 122 | keypair = keypairs.get(KEYPAIR_NAME, '') 123 | if keypair: 124 | try: 125 | sys.stdout.write("Deleting keypair %s (%s) ... 
" % (keypair.key_name, 126 | KEYPAIR_NAME)) 127 | sys.stdout.write(response_type(keypair.delete()) + '\n') 128 | except Exception as e: 129 | sys.stdout.write('failed\n') 130 | util.log_error(str(e) + '\n') 131 | 132 | keypair_fn = u.get_keypair_fn() 133 | if os.path.exists(keypair_fn): 134 | print("Deleting local keypair file %s" % (keypair_fn,)) 135 | os.system('rm -f ' + keypair_fn) 136 | 137 | 138 | def delete_resources(force_delete_efs=False): 139 | region = os.environ['AWS_DEFAULT_REGION'] 140 | 141 | resource = u.get_prefix() 142 | print(f"Deleting {resource} resources in region {region}") 143 | print(f"Make sure {resource} instances are terminated or this will fail.") 144 | 145 | if 'efs' in args.kind or 'all' in args.kind: 146 | if EFS_NAME == u.DEFAULT_PREFIX and not force_delete_efs: 147 | # this is default EFS, likely has stuff, require extra flag to delete it 148 | print("default EFS has useful stuff in it, not deleting it. Use force-delete-efs " 149 | "flag to force") 150 | else: 151 | delete_efs() 152 | if 'network' in args.kind or 'all' in args.kind: 153 | delete_network() 154 | if 'keypair' in args.kind or 'all' in args.kind: 155 | delete_keypair() 156 | 157 | 158 | if __name__ == '__main__': 159 | delete_resources(force_delete_efs=args.force_delete_efs) 160 | -------------------------------------------------------------------------------- /ncluster/aws_util.py: -------------------------------------------------------------------------------- 1 | """Methods used in aws_backend, but also useful for standalone prototyping in Jupyter""" 2 | 3 | import os 4 | import re 5 | import sys 6 | import time 7 | from collections import Iterable 8 | from collections import OrderedDict 9 | import paramiko 10 | from operator import itemgetter 11 | 12 | 13 | import boto3 14 | 15 | from . 
import util 16 | 17 | EMPTY_NAME = "noname" # name to use when name attribute is missing on AWS 18 | RETRY_INTERVAL_SEC = 1 # how long to wait before retries 19 | RETRY_TIMEOUT_SEC = 30 # how long to wait before retrying fails 20 | DEFAULT_PREFIX = 'ncluster' 21 | PRIVATE_KEY_LOCATION = os.environ['HOME'] + '/.ncluster' 22 | DUPLICATE_CHECKING = False 23 | 24 | 25 | # Can't annotate boto3 return types because they are missing stubs 26 | # https://github.com/boto/boto3/issues/1055 27 | # https://stackoverflow.com/questions/52087307/adding-type-hinting-to-functions-that-return-boto3-objects 28 | 29 | def get_vpc(): 30 | """ 31 | Returns current VPC (ec2.Vpc object) 32 | https://boto3.readthedocs.io/en/latest/reference/services/ec2.html#vpc 33 | """ 34 | 35 | return get_vpc_dict()[get_prefix()] 36 | 37 | 38 | def get_security_group(): 39 | """ 40 | Returns current security group, ec2.SecurityGroup object 41 | https://boto3.readthedocs.io/en/latest/reference/services/ec2.html#securitygroup 42 | """ 43 | return get_security_group_dict()[get_prefix()] 44 | 45 | 46 | def get_subnet(): 47 | return get_subnet_dict()[get_zone()] 48 | 49 | 50 | def get_vpc_dict(): 51 | """Returns dictionary of named VPCs {name: vpc} 52 | 53 | Assert fails if there's more than one VPC with same name.""" 54 | 55 | client = get_ec2_client() 56 | response = client.describe_vpcs() 57 | assert is_good_response(response) 58 | 59 | result = OrderedDict() 60 | ec2 = get_ec2_resource() 61 | for vpc_response in response['Vpcs']: 62 | key = get_name(vpc_response.get('Tags', [])) 63 | if not key or key == EMPTY_NAME: # skip VPC's that don't have a name assigned 64 | continue 65 | 66 | if key in result: 67 | util.log(f"Warning: Duplicate VPC group {key} in {response}") 68 | if DUPLICATE_CHECKING: 69 | assert False 70 | result[key] = ec2.Vpc(vpc_response['VpcId']) 71 | 72 | return result 73 | 74 | 75 | def get_default_vpc(): 76 | """ 77 | Return default VPC or none if not present 78 | 79 | """ 80 | ec2 = 
get_ec2_resource() 81 | for vpc in ec2.vpcs.all(): 82 | if vpc.is_default: 83 | return vpc 84 | 85 | 86 | def get_subnet_dict(): 87 | """Returns dictionary of "availability zone" -> subnet for current VPC.""" 88 | subnet_dict = {} 89 | vpc = get_vpc() 90 | for subnet in vpc.subnets.all(): 91 | zone = subnet.availability_zone 92 | assert zone not in subnet_dict, "More than one subnet in %s, why?" % (zone,) 93 | subnet_dict[zone] = subnet 94 | return subnet_dict 95 | 96 | 97 | def get_gateway_dict(vpc): 98 | """Returns dictionary of named gateways for given VPC {name: gateway}""" 99 | return {get_name(gateway): gateway for 100 | gateway in vpc.internet_gateways.all()} 101 | 102 | 103 | def get_efs_dict(): 104 | """Returns dictionary of {efs_name: efs_id}""" 105 | # there's no EC2 resource for EFS objects, so return EFS_ID instead 106 | # https://stackoverflow.com/questions/47870342/no-ec2-resource-for-efs-objects 107 | 108 | efs_client = get_efs_client() 109 | response = call_with_retries(efs_client.describe_file_systems, 110 | 'efs_client.describe_file_systems') 111 | assert is_good_response(response) 112 | result = OrderedDict() 113 | for efs_response in response['FileSystems']: 114 | fs_id = efs_response['FileSystemId'] 115 | 116 | tag_response = call_with_retries(efs_client.describe_tags, 117 | "efs_client.describe_tags", 118 | FileSystemId=fs_id, retry_interval_sec=2) 119 | assert is_good_response(tag_response) 120 | key = get_name(tag_response['Tags']) 121 | if not key or key == EMPTY_NAME: # skip EFS's without a name 122 | continue 123 | assert key not in result 124 | result[key] = fs_id 125 | 126 | return result 127 | 128 | 129 | def get_placement_group_dict(): 130 | """Returns dictionary of {placement_group_name: (state, strategy)}""" 131 | 132 | client = get_ec2_client() 133 | response = client.describe_placement_groups() 134 | assert is_good_response(response) 135 | 136 | result = OrderedDict() 137 | ec2 = get_ec2_resource() 138 | for 
placement_group_response in response['PlacementGroups']: 139 | key = placement_group_response['GroupName'] 140 | if key in result: 141 | util.log(f"Warning: Duplicate placement_group group {key}") 142 | if DUPLICATE_CHECKING: 143 | assert False 144 | result[key] = ec2.PlacementGroup(key) 145 | return result 146 | 147 | 148 | def get_security_group_dict(): 149 | """Returns dictionary of named security groups {name: securitygroup}.""" 150 | 151 | client = get_ec2_client() 152 | response = client.describe_security_groups() 153 | assert is_good_response(response) 154 | 155 | result = OrderedDict() 156 | ec2 = get_ec2_resource() 157 | for security_group_response in response['SecurityGroups']: 158 | key = get_name(security_group_response.get('Tags', [])) 159 | if not key or key == EMPTY_NAME: 160 | continue # ignore unnamed security groups 161 | # key = security_group_response['GroupName'] 162 | if key in result: 163 | util.log(f"Warning: Duplicate security group {key}") 164 | if DUPLICATE_CHECKING: 165 | assert key not in result, ("Duplicate security group " + key) 166 | result[key] = ec2.SecurityGroup(security_group_response['GroupId']) 167 | 168 | return result 169 | 170 | 171 | def get_keypair_dict(): 172 | """Returns dictionary of {keypairname: keypair}""" 173 | 174 | client = get_ec2_client() 175 | response = client.describe_key_pairs() 176 | assert is_good_response(response) 177 | 178 | result = {} 179 | ec2 = get_ec2_resource() 180 | for keypair in response['KeyPairs']: 181 | keypair_name = keypair.get('KeyName', '') 182 | if keypair_name in result: 183 | util.log(f"Warning: Duplicate key {keypair_name}") 184 | if DUPLICATE_CHECKING: 185 | assert keypair_name not in result, "Duplicate key " + keypair_name 186 | result[keypair_name] = ec2.KeyPair(keypair_name) 187 | return result 188 | 189 | 190 | def get_prefix(): 191 | """Global prefix to identify ncluster created resources name used to identify ncluster created resources, 192 | (name of EFS, VPC, keypair 
prefixes), can be changed through $NCLUSTER_PREFIX for debugging purposes. """ 193 | 194 | name = os.environ.get('NCLUSTER_PREFIX', DEFAULT_PREFIX) 195 | if name != DEFAULT_PREFIX: 196 | validate_prefix(name) 197 | return name 198 | 199 | 200 | def get_account_number(): 201 | while True: 202 | try: 203 | return str(boto3.client('sts').get_caller_identity()['Account']) 204 | except Exception as e: 205 | util.log(f'Exception in get_account_number {e}, retrying') 206 | if 'AWS_SECRET_ACCESS_KEY' not in os.environ: 207 | util.log( 208 | 'AWS_SECRET_ACCESS_KEY not in env vars, configure your AWS credentials."') 209 | time.sleep(RETRY_INTERVAL_SEC) 210 | 211 | 212 | def get_region(): 213 | return get_session().region_name 214 | 215 | 216 | def get_zone() -> str: 217 | """Returns current zone, or empty string if it's unset.""" 218 | return os.environ.get('NCLUSTER_ZONE', '') 219 | 220 | 221 | def get_zones(): 222 | client = get_ec2_client() 223 | response = client.describe_availability_zones() 224 | assert is_good_response(response) 225 | zones = [] 226 | for avail_response in response['AvailabilityZones']: 227 | messages = avail_response['Messages'] 228 | zone = avail_response['ZoneName'] 229 | state = avail_response['State'] 230 | assert not messages, f"zone {zone} is broken? Has messages {messages}" 231 | assert state == 'available', f"zone {zone} is broken? Has state {state}" 232 | zones.append(zone) 233 | return zones 234 | 235 | 236 | def get_session(): 237 | # in future can add aws profile support with Session(profile_name=...) 
238 | return boto3.Session() 239 | 240 | 241 | ################################################################################ 242 | # keypairs 243 | ################################################################################ 244 | # For naming conventions, see 245 | # https://docs.google.com/document/d/14-zpee6HMRYtEfQ_H_UN9V92bBQOt0pGuRKcEJsxLEA/edit#heading=h.45ok0839c0a 246 | 247 | def get_keypair_name(): 248 | """Returns current keypair name.""" 249 | 250 | username = get_username() 251 | assert '-' not in username, "username must not contain -, change $USER" 252 | validate_aws_name(username) 253 | assert len(username) < 30 # to avoid exceeding AWS 127 char limit 254 | return get_prefix() + '-' + username 255 | 256 | 257 | def get_keypair(): 258 | """Returns current keypair (ec2.KeyPairInfo) 259 | 260 | https://boto3.readthedocs.io/en/latest/reference/services/ec2.html#keypairinfo 261 | """ 262 | 263 | return get_keypair_dict()[get_keypair_name()] 264 | 265 | 266 | def get_keypair_fn(): 267 | """Location of .pem file for current keypair""" 268 | 269 | keypair_name = get_keypair_name() 270 | account = get_account_number() 271 | region = get_region() 272 | fn = f'{PRIVATE_KEY_LOCATION}/{keypair_name}-{account}-{region}.pem' 273 | return fn 274 | 275 | 276 | def get_vpc_name(): 277 | return get_prefix() 278 | 279 | 280 | def get_security_group_name(): 281 | # We have two security groups, ncluster for manually created VPC and 282 | # ncluster-default for default VPC. Once default VPC works for all cases, can 283 | # get rid of one of security groups 284 | return get_prefix() 285 | 286 | 287 | def get_gateway_name(): 288 | return get_prefix() 289 | 290 | 291 | def get_route_table_name(): 292 | return get_prefix() 293 | 294 | 295 | def get_efs_name(): 296 | return get_prefix() 297 | 298 | 299 | def get_username(): 300 | assert 'USER' in os.environ, "why isn't USER defined?" 
301 | return os.environ['USER'] 302 | 303 | 304 | def lookup_image(wildcard): 305 | """Returns unique ec2.Image whose name matches wildcard 306 | lookup_ami('pytorch*').name => ami-29fa 307 | 308 | https://boto3.readthedocs.io/en/latest/reference/services/ec2.html#image 309 | 310 | Assert fails if multiple images match or no images match. 311 | """ 312 | 313 | ec2 = get_ec2_resource() 314 | filter_ = {'Name': 'name', 'Values': [wildcard]} 315 | 316 | images = list(ec2.images.filter(Filters=[filter_])) 317 | 318 | # Note, can add filtering by Owners as follows 319 | # images = list(ec2.images.filter_(Filters = [filter_], Owners=['self', 'amazon'])) 320 | 321 | assert len(images) <= 1, "Multiple images match " + str(wildcard) 322 | assert len(images) > 0, "No images match " + str(wildcard) 323 | return images[0] 324 | 325 | 326 | def lookup_instance(name: str, instance_type: str = '', image_name: str = '', 327 | states: tuple = ('running', 'stopped', 'initializing')): 328 | """Looks up AWS instance for given instance name, like 329 | simple.worker. If no instance found in current AWS environment, returns None. 
""" 330 | 331 | ec2 = get_ec2_resource() 332 | 333 | instances = ec2.instances.filter( 334 | Filters=[{'Name': 'instance-state-name', 'Values': states}]) 335 | 336 | prefix = get_prefix() 337 | username = get_username() 338 | 339 | # look for an existing instance matching job, ignore instances launched 340 | # by different user or under different resource name 341 | result = [] 342 | for i in instances.all(): 343 | instance_name = get_name(i) 344 | if instance_name != name: 345 | continue 346 | 347 | seen_prefix, seen_username = parse_key_name(i.key_name) 348 | if prefix != seen_prefix: 349 | print(f"Found {name} launched under {seen_prefix}, ignoring") 350 | continue 351 | if username != seen_username: 352 | print(f"Found {name} launched by {seen_username}, ignoring") 353 | continue 354 | 355 | if instance_type: 356 | assert i.instance_type == instance_type, f"Found existing instance for job {name} but different instance type ({i.instance_type}) than requested ({instance_type}), terminate {name} first or use new task name." 357 | 358 | if image_name: 359 | assert i.image.name == image_name, f"Found existing instance for job {name} but launched with different image ({i.image.name}) than requested ({image_name}), terminate {name} first or use new task name." 360 | result.append(i) 361 | 362 | assert len(result) < 2, f"Found two instances with name {name}" 363 | if not result: 364 | return None 365 | else: 366 | return result[0] 367 | 368 | 369 | def ssh_to_task(task) -> paramiko.SSHClient: 370 | """Create ssh connection to task's machine 371 | 372 | returns Paramiko SSH client connected to host. 
373 | 374 | """ 375 | 376 | username = task.ssh_username 377 | hostname = task.public_ip 378 | ssh_key_fn = get_keypair_fn() 379 | print(f"ssh -i {ssh_key_fn} {username}@{hostname}") 380 | pkey = paramiko.RSAKey.from_private_key_file(ssh_key_fn) 381 | 382 | ssh_client = paramiko.SSHClient() 383 | ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) 384 | assert ssh_client 385 | 386 | counter = 1 387 | while True: 388 | try: 389 | ssh_client.connect(hostname=hostname, username=username, pkey=pkey) 390 | if counter % 11 == 0: # occasionally re-obtain public ip, machine could've gotten restarted 391 | hostname = task.public_ip 392 | break 393 | except Exception as e: 394 | print( 395 | f'{task.name}: Exception connecting to {hostname} via ssh (could be a timeout): {e}') 396 | time.sleep(RETRY_INTERVAL_SEC) 397 | 398 | return ssh_client 399 | 400 | 401 | def parse_key_name(keyname): 402 | """keyname => resource, username""" 403 | # Relies on resource name not containing -, validated in 404 | # validate_resource_name 405 | toks = keyname.split('-') 406 | if len(toks) != 2: 407 | return None, None # some other keyname not launched by nexus 408 | else: 409 | return toks 410 | 411 | 412 | aws_name_regexp = re.compile('^[a-zA-Z0-9+-=._:/@]*$') 413 | 414 | 415 | def validate_aws_name(name): 416 | """Validate resource name using AWS name restrictions from # http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Using_Tags.html#tag-restrictions""" 417 | assert len(name) <= 127 418 | # disallow unicode characters to avoid pain 419 | assert name == name.encode('ascii').decode('ascii') 420 | assert aws_name_regexp.match(name) 421 | 422 | 423 | resource_regexp = re.compile('^[a-z0-9]+$') 424 | 425 | 426 | def validate_prefix(name): 427 | """Check that name is valid as substitute for default prefix. 
Since it's used in unix filenames, key names, be more conservative than AWS requirements, just allow 30 chars, lowercase only.""" 428 | assert len(name) <= 30 429 | assert resource_regexp.match(name) 430 | validate_aws_name(name) 431 | 432 | 433 | def validate_run_name(name): 434 | """Name used for run. Used as part of instance name, tmux session name.""" 435 | assert len(name) <= 30 436 | validate_aws_name(name) 437 | 438 | 439 | def create_name_tags(name): 440 | """Returns [{'Key': 'Name', 'Value': name}] """ 441 | return [{'Key': 'Name', 'Value': name}] 442 | 443 | 444 | def create_efs(name): 445 | efs_client = get_efs_client() 446 | token = str(int(time.time() * 1e6)) # epoch usec 447 | 448 | response = efs_client.create_file_system(CreationToken=token, 449 | PerformanceMode='generalPurpose') 450 | assert is_good_response(response) 451 | start_time = time.time() 452 | while True: 453 | try: 454 | 455 | response = efs_client.create_file_system(CreationToken=token, 456 | PerformanceMode='generalPurpose') 457 | assert is_good_response(response) 458 | time.sleep(RETRY_INTERVAL_SEC) 459 | except Exception as e: 460 | if 'FileSystemAlreadyExists' in str(e): 461 | break 462 | if response['Error']['Code'] == 'FileSystemAlreadyExists': 463 | break 464 | else: 465 | util.log_error(e) 466 | break 467 | 468 | if time.time() - start_time - RETRY_INTERVAL_SEC > RETRY_TIMEOUT_SEC: 469 | assert False, "Timeout exceeded creating EFS %s (%s)" % (token, name) 470 | 471 | time.sleep(RETRY_TIMEOUT_SEC) 472 | 473 | # find efs id from given token 474 | response = efs_client.describe_file_systems() 475 | assert is_good_response(response) 476 | fs_id = extract_attr_for_match(response['FileSystems'], FileSystemId=-1, 477 | CreationToken=token) 478 | response = efs_client.create_tags(FileSystemId=fs_id, 479 | Tags=create_name_tags(name)) 480 | assert is_good_response(response) 481 | 482 | # make sure EFS is now visible 483 | efs_dict = get_efs_dict() 484 | assert name in efs_dict 485 | 
return efs_dict[name] 486 | 487 | 488 | def delete_efs_by_id(efs_id): 489 | """Deletion sometimes fails, try several times.""" 490 | start_time = time.time() 491 | efs_client = get_efs_client() 492 | sys.stdout.write("deleting %s ... " % (efs_id,)) 493 | while True: 494 | try: 495 | response = efs_client.delete_file_system(FileSystemId=efs_id) 496 | if is_good_response(response): 497 | print("succeeded") 498 | break 499 | time.sleep(RETRY_INTERVAL_SEC) 500 | except Exception as e: 501 | print("Failed with %s" % (e,)) 502 | if time.time() - start_time - RETRY_INTERVAL_SEC < RETRY_TIMEOUT_SEC: 503 | print("Retrying in %s sec" % (RETRY_INTERVAL_SEC,)) 504 | time.sleep(RETRY_INTERVAL_SEC) 505 | else: 506 | print("Giving up") 507 | break 508 | 509 | 510 | def extract_attr_for_match(items, **kwargs): 511 | """Helper method to get attribute value for an item matching some criterion. 512 | Specify target criteria value as dict, with target attribute having value -1 513 | 514 | Example: 515 | to extract state of vpc matching given vpc id 516 | 517 | response = [{'State': 'available', 'VpcId': 'vpc-2bb1584c'}] 518 | extract_attr_for_match(response, State=-1, VpcId='vpc-2bb1584c') #=> 'available'""" 519 | 520 | # find the value of attribute to return 521 | query_arg = None 522 | for arg, value in kwargs.items(): 523 | if value == -1: 524 | assert query_arg is None, "Only single query arg (-1 valued) is allowed" 525 | query_arg = arg 526 | result = [] 527 | 528 | filterset = set(kwargs.keys()) 529 | for item in items: 530 | match = True 531 | assert filterset.issubset( 532 | item.keys()), "Filter set contained %s which was not in record %s" % ( 533 | filterset.difference(item.keys()), 534 | item) 535 | for arg in item: 536 | if arg == query_arg: 537 | continue 538 | if arg in kwargs: 539 | if item[arg] != kwargs[arg]: 540 | match = False 541 | break 542 | if match: 543 | result.append(item[query_arg]) 544 | assert len(result) <= 1, "%d values matched %s, only allow 1" % ( 545 
| len(result), kwargs) 546 | if result: 547 | return result[0] 548 | return None 549 | 550 | 551 | def get_tags(instance): 552 | """Returns instance tags.""" 553 | 554 | return get_instance_property(instance, 'tags') 555 | 556 | 557 | def get_public_ip(instance): 558 | return get_instance_property(instance, 'public_ip_address') 559 | 560 | 561 | def get_ip(instance): 562 | return get_instance_property(instance, 'private_ip_address') 563 | 564 | 565 | def get_instance_property(instance, property_name): 566 | """Retrieves property of an instance, keeps retrying until getting a non-None""" 567 | 568 | name = get_name(instance) 569 | while True: 570 | try: 571 | value = getattr(instance, property_name) 572 | if value is not None: 573 | break 574 | print(f"retrieving {property_name} on {name} produced None, retrying") 575 | time.sleep(RETRY_INTERVAL_SEC) 576 | instance.reload() 577 | continue 578 | except Exception as e: 579 | print(f"retrieving {property_name} on {name} failed with {e}, retrying") 580 | time.sleep(RETRY_INTERVAL_SEC) 581 | try: 582 | instance.reload() 583 | except Exception: 584 | pass 585 | continue 586 | 587 | return value 588 | 589 | 590 | def call_with_retries(method, debug_string='', 591 | retry_interval_sec=RETRY_INTERVAL_SEC, 592 | **kwargs): 593 | while True: 594 | try: 595 | value = method(**kwargs) 596 | assert value is not None, f"{debug_string} was None" 597 | break 598 | except Exception as e: 599 | print(f"{debug_string} failed with {e.__class__}({e}), retrying") 600 | time.sleep(retry_interval_sec) 601 | continue 602 | 603 | return value 604 | 605 | 606 | def get_ec2_resource(): 607 | try: 608 | client = get_session().resource('ec2') 609 | except Exception as e: 610 | print(f"Failed with error '{e}'") 611 | print("To specify Virginia region, do 'export AWS_DEFAULT_REGION=us-east-1'") 612 | sys.exit() 613 | return client 614 | 615 | 616 | def get_ec2_client(): 617 | try: 618 | client = get_session().client('ec2') 619 | except Exception as 
e: 620 | print(f"Failed with error '{e}'") 621 | print("To specify Virginia region, do 'export AWS_DEFAULT_REGION=us-east-1'") 622 | sys.exit() 623 | return client 624 | 625 | 626 | def get_efs_client(): 627 | while True: 628 | try: 629 | return get_session().client('efs') 630 | except Exception as e: 631 | # can get following 632 | # botocore.exceptions.DataNotFoundError: Unable to load data for: endpoints 633 | util.log(f"get_session().client('efs') failed with {e}, retrying") 634 | time.sleep(2) 635 | 636 | 637 | def is_good_response(response): 638 | """Helper method to check if boto3 call was a success.""" 639 | 640 | code = response["ResponseMetadata"]['HTTPStatusCode'] 641 | # get response code 201 on EFS creation 642 | return 200 <= code < 300 643 | 644 | 645 | def get_name(tags_or_instance_or_id): 646 | """Helper utility to extract name out of tags dictionary or intancce. 647 | [{'Key': 'Name', 'Value': 'nexus'}] -> 'nexus' 648 | 649 | Assert fails if there's more than one name. 650 | Returns '' if there's less than one name. 
651 | """ 652 | 653 | ec2 = get_ec2_resource() 654 | if hasattr(tags_or_instance_or_id, 'tags'): 655 | tags = tags_or_instance_or_id.tags 656 | elif isinstance(tags_or_instance_or_id, str): 657 | tags = ec2.Instance(tags_or_instance_or_id).tags 658 | elif tags_or_instance_or_id is None: 659 | return EMPTY_NAME 660 | else: 661 | assert isinstance(tags_or_instance_or_id, 662 | Iterable), "expected iterable of tags" 663 | tags = tags_or_instance_or_id 664 | 665 | if not tags: 666 | return EMPTY_NAME 667 | names = [entry['Value'] for entry in tags if entry['Key'] == 'Name'] 668 | if not names: 669 | return '' 670 | if len(names) > 1: 671 | assert False, "have more than one name: " + str(names) 672 | return names[0] 673 | 674 | 675 | def wait_until_available(resource): 676 | """Waits until interval state becomes 'available'""" 677 | while True: 678 | resource.load() 679 | if resource.state == 'available': 680 | break 681 | time.sleep(RETRY_INTERVAL_SEC) 682 | 683 | 684 | def maybe_create_placement_group(name='', max_retries=10): 685 | """Creates placement_group group or reuses existing one. Crash if unable to create 686 | placement_group group. 
If name is empty, ignores request.""" 687 | 688 | if not name: 689 | return 690 | 691 | client = get_ec2_client() 692 | while True: 693 | try: 694 | client.describe_placement_groups(GroupNames=[name]) 695 | print("Reusing placement_group group: " + name) 696 | break # no Exception means group name was found 697 | except Exception: 698 | print("Creating placement_group group: " + name) 699 | try: 700 | _response = client.create_placement_group(GroupName=name, 701 | Strategy='cluster') 702 | except Exception: 703 | # because of race can get InvalidPlacementGroup.Duplicate 704 | pass 705 | 706 | counter = 0 707 | while True: 708 | try: 709 | res = client.describe_placement_groups(GroupNames=[name]) 710 | res_entry = res['PlacementGroups'][0] 711 | if res_entry['State'] == 'available': 712 | assert res_entry['Strategy'] == 'cluster' 713 | break 714 | except Exception as e: 715 | print("Got exception: %s" % (e,)) 716 | counter += 1 717 | if counter >= max_retries: 718 | assert False, f'Failed to create placement_group group {name} in {max_retries} attempts' 719 | time.sleep(RETRY_INTERVAL_SEC) 720 | 721 | 722 | # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html#concepts-placement-groups 723 | def instance_supports_placement_groups(instance_type: str): 724 | regex = re.compile( 725 | "^(m4|m5|m5d|c3|c4|c5|c5d|cc2.8xlarge|cr1.8xlarge|r3|r4|r5|r5d|x1|x1e|z1d|d2|h1|hs1.8xlarge|i2|i3|i3.metal|f1|g2|g3|p2|p3).*$", 726 | re.IGNORECASE) 727 | return regex.match(instance_type) 728 | 729 | 730 | def lookup_instances(fragment, verbose=True, filter_by_key=True): 731 | """Returns ec2.Instance object whose name contains fragment, in reverse order of launching (ie, 732 | most recent intance first). Optionally filters by key, only including instances launched with 733 | key_name matching current username. 
734 | 735 | args: 736 | verbose: print information about all matching instances found 737 | 738 | filter_by_key if True, ignore instances that are not launched with current 739 | user's default key 740 | """ 741 | 742 | def vprint(*args): 743 | if verbose: 744 | print(*args) 745 | 746 | region = get_region() 747 | client = get_ec2_client() 748 | ec2 = get_ec2_resource() 749 | response = client.describe_instances() 750 | assert is_good_response(response) 751 | 752 | instance_list = [] 753 | for instance in ec2.instances.all(): 754 | if instance.state['Name'] != 'running': 755 | continue 756 | 757 | name = get_name(instance) 758 | if (fragment in name or fragment in str(instance.public_ip_address) or 759 | fragment in str(instance.id) or fragment in str(instance.private_ip_address)): 760 | instance_list.append((util.toseconds(instance.launch_time), instance)) 761 | 762 | sorted_instance_list = reversed(sorted(instance_list, key=itemgetter(0))) 763 | filtered_instance_list = [] # filter by key 764 | vprint("Using region ", region) 765 | for (ts, instance) in sorted_instance_list: 766 | if filter_by_key and instance.key_name != get_keypair_name(): 767 | vprint(f"Got key {instance.key_name}, expected {get_keypair_name()}") 768 | continue 769 | filtered_instance_list.append(instance) 770 | return filtered_instance_list 771 | 772 | def create_spot_instances(launch_specs, spot_price=26, expiration_mins=15): 773 | """ 774 | args: 775 | spot_price: default is $26 which is right above p3.16xlarge on demand price 776 | expiration_mins: this request only valid for this many mins from now 777 | """ 778 | ec2c = get_ec2_client() 779 | 780 | num_tasks = launch_specs['MinCount'] or 1 781 | if 'MinCount' in launch_specs: del launch_specs['MinCount'] 782 | if 'MaxCount' in launch_specs: del launch_specs['MaxCount'] 783 | if 'TagSpecifications' in launch_specs: 784 | try: tags = launch_specs['TagSpecifications'][0]['Tags'] 785 | except: pass 786 | del launch_specs['TagSpecifications'] 
787 | 788 | import pytz # datetime is not timezone aware, use pytz to fix 789 | import datetime as dt 790 | now = dt.datetime.utcnow().replace(tzinfo=pytz.utc) 791 | 792 | spot_args = {} 793 | spot_args['LaunchSpecification'] = launch_specs 794 | spot_args['SpotPrice'] = str(spot_price) 795 | spot_args['InstanceCount'] = num_tasks 796 | spot_args['ValidUntil'] = now + dt.timedelta(minutes=expiration_mins) 797 | 798 | try: 799 | spot_requests = ec2c.request_spot_instances(**spot_args) 800 | except Exception as e: 801 | assert False, f"Spot instance request failed (out of capacity?), error was {e}" 802 | 803 | spot_requests = spot_requests['SpotInstanceRequests'] 804 | instance_ids = wait_on_fulfillment(ec2c, spot_requests) 805 | 806 | print('Instances fullfilled...') 807 | ec2 = get_ec2_resource() 808 | instances = list(ec2.instances.filter(Filters=[{'Name': 'instance-id', 'Values': list(filter(None, instance_ids))}])) 809 | 810 | if not all(instance_ids): 811 | for i in instances: 812 | i.terminate() 813 | raise RuntimeError('Failed to create spot instances:', instance_ids) 814 | 815 | if tags: 816 | for i in instances: 817 | i.create_tags(Tags=tags) 818 | 819 | return instances 820 | 821 | 822 | def wait_on_fulfillment(ec2c, reqs): 823 | def get_instance_id(req): 824 | while req['State'] != 'active': 825 | print('Waiting on spot fullfillment...') 826 | time.sleep(5) 827 | reqs = ec2c.describe_spot_instance_requests(Filters=[{'Name': 'spot-instance-request-id', 'Values': [req['SpotInstanceRequestId']]}]) 828 | if not reqs['SpotInstanceRequests']: 829 | print(f"SpotInstanceRequest for {req['SpotInstanceRequestId']} not found") 830 | continue 831 | req = reqs['SpotInstanceRequests'][0] 832 | req_status = req['Status'] 833 | if req_status['Code'] not in ['pending-evaluation', 'pending-fulfillment', 'fulfilled']: 834 | print('Spot instance request failed:', req_status['Message']) 835 | print('Cancelling request. 
Please try again or use on demand.') 836 | ec2c.cancel_spot_instance_requests(SpotInstanceRequestIds=[req['SpotInstanceRequestId']]) 837 | print(req) 838 | return None 839 | instance_id = req['InstanceId'] 840 | print('Fulfillment completed. InstanceId:', instance_id) 841 | return instance_id 842 | return [get_instance_id(req) for req in reqs] 843 | -------------------------------------------------------------------------------- /ncluster/backend.py: -------------------------------------------------------------------------------- 1 | """Interface for job launching backend. 2 | 3 | Run/Job and Task are container classes encapsulating functionality. 4 | User creates them through make_run/make_job/make_task methods 5 | 6 | """ 7 | # Job launcher Python API: https://docs.google.com/document/d/1yTkb4IPJXOUaEWksQPCH7q0sjqHgBf3f70cWzfoFboc/edit 8 | # AWS job launcher (concepts): https://docs.google.com/document/d/1IbVn8_ckfVO3Z9gIiE0b9K3UrBRRiO9HYZvXSkPXGuw/edit 9 | import threading 10 | import time 11 | from typing import List, Tuple, Any 12 | 13 | from . 
import util 14 | 15 | # aws_backend.py 16 | # local_backend.py 17 | 18 | LOGDIR_ROOT: str = None # location of logdir for this backend 19 | 20 | """ 21 | backend = aws_backend # alternatively, backend=tmux_backend to launch jobs locally in separate tmux sessions 22 | run = backend.make_run("helloworld") # sets up /efs/runs/helloworld 23 | worker_job = run.make_job("worker", instance_type="g3.4xlarge", num_tasks=4, ami=ami, setup_script=setup_script) 24 | ps_job = run.make_job("ps", instance_type="c5.xlarge", num_tasks=4, ami=ami, setup_script=setup_script) 25 | setup_tf_config(worker_job, ps_job) 26 | ps_job.run("python cifar10_main.py --num_gpus=0") # runs command on each task 27 | worker_job.run("python cifar10_main.py --num_gpus=4") 28 | 29 | tb_job = run.make_job("tb", instance_type="m4.xlarge", num_tasks=1, public_port=6006) 30 | tb_job.run("tensorboard --logdir=%s --port=%d" %(run.logdir, 6006)) 31 | # when job has one task, job.task[0].ip can be accessed as job.ip 32 | print("See TensorBoard progress on %s:%d" %(tb_job.ip, 6006)) 33 | print("To interact with workers: %s" %(worker_job.connect_instructions)) 34 | 35 | 36 | To reconnect to existing job: 37 | 38 | """ 39 | 40 | 41 | class Task: 42 | name: str 43 | ip: str 44 | public_ip: str 45 | run_counter: int 46 | # location where temporary files from interfacing with task go locally 47 | local_scratch: str 48 | # location where temporary files from interfacing with task go on task 49 | remote_scratch: str 50 | job: Any # can't declare Job because of circular dependency 51 | 52 | def __init__(self, name=''): 53 | """Wraps execution resources into a task. 
def wait_for_file(self, fn: str, max_wait_sec: int = 3600 * 24 * 365,
                  check_interval: float = 0.02) -> bool:
  """Polls the task until file `fn` exists or `max_wait_sec` elapses.

  The file is probed before the deadline is tested, so a file that is already
  present is reported as found even when max_wait_sec is 0.

  Args:
    fn: filename on task machine
    max_wait_sec: how long to wait in seconds
    check_interval: how often to check in seconds
  Returns:
    True if the file was detected, False if waiting was cut short by the
    max_wait_sec limit
  """
  print("Waiting for file", fn)
  # monotonic clock: wall-clock adjustments must not stretch or shrink the wait
  deadline = time.monotonic() + max_wait_sec
  while not self.exists(fn):
    if time.monotonic() > deadline:
      util.log(f"Timeout exceeded ({max_wait_sec} sec) for {fn}")
      return False
    time.sleep(check_interval)
  return True
class Job:
  """Named group of tasks that is driven as a unit.

  Methods mirroring Task methods fan the call out to every task in parallel
  (one thread per task), block until all tasks finish and return the per-task
  results in task order.
  """
  name: str
  tasks: List['Task']  # forward reference, Task is defined in the same module

  # run_: Run

  def __init__(self, name: str, tasks: List['Task'] = None, **kwargs):
    """Initializes Job object, links tasks to refer back to the Job.

    Args:
      name: name of the job
      tasks: tasks belonging to the job, empty list if omitted
      **kwargs: stored unchanged on the job as self.kwargs
    """
    if tasks is None:
      tasks = []
    self.name = name
    self.tasks = tasks
    self.kwargs = kwargs
    # TODO: maybe backlinking is not needed
    for task in tasks:
      task.job = self

  @property
  def logdir(self):
    """Logdir of the first task of the job."""
    return self.tasks[0].logdir

  def _non_blocking_wrapper(self, method, *args, **kwargs):
    """Runs given method on every task in the job. Blocks until all tasks finish.

    Args:
      method: name of the Task method to invoke
      *args, **kwargs: forwarded unchanged to every task
    Returns:
      List with the return value of each task, in task order.
    Raises:
      The exception of the failed task with the lowest index, if any failed.
    """
    task_count = len(self.tasks)
    # one slot per task: threads never write to the same index
    results = [None] * task_count
    errors = [None] * task_count

    def task_run(index, task):
      try:
        results[index] = getattr(task, method)(*args, **kwargs)
      except Exception as e:
        errors[index] = e

    threads = [threading.Thread(name=f'task_{method}_{i}',
                                target=task_run, args=[i, t])
               for i, t in enumerate(self.tasks)]
    for thread in threads:
      thread.start()
    for thread in threads:
      thread.join()
    # scan in task order so the raised exception does not depend on timing
    for error in errors:
      if error is not None:
        raise error
    return results

  def run(self, *args, **kwargs):
    """Runs command on every task in the job in parallel, blocks until all tasks finish.
    See Task for documentation of args/kwargs."""
    return self._non_blocking_wrapper("run", *args, **kwargs)

  def run_with_output(self, *args, **kwargs):
    """Runs command on every task in the job in parallel, blocks until all tasks finish.
    Returns list of per-task results. See Task for documentation of args/kwargs."""
    return self._non_blocking_wrapper("run_with_output", *args, **kwargs)

  def upload(self, *args, **kwargs):
    """See :py:func:`backend.Task.upload`"""
    return self._non_blocking_wrapper("upload", *args, **kwargs)

  def write(self, *args, **kwargs):
    """Calls Task.write on every task in the job in parallel."""
    return self._non_blocking_wrapper("write", *args, **kwargs)

  def _run_raw(self, *args, **kwargs):
    """Calls Task._run_raw on every task in the job in parallel."""
    return self._non_blocking_wrapper("_run_raw", *args, **kwargs)
250 | :ivar aws_placement_group_name: somedoc 251 | """ 252 | jobs: List[Job] 253 | 254 | @property 255 | def logdir(self): 256 | raise NotImplementedError() 257 | 258 | # TODO: currently this is synchronous, use non_blocking wrapper like in Job to parallelize methods 259 | def run(self, *args, **kwargs): 260 | raise NotImplementedError() 261 | 262 | def run_with_output(self, *args, **kwargs): 263 | raise NotImplementedError() 264 | 265 | def _run_raw(self, *args, **kwargs): 266 | raise NotImplementedError() 267 | 268 | def upload(self, *args, **kwargs): 269 | raise NotImplementedError() 270 | 271 | def make_job(self, name='', **kwargs): 272 | raise NotImplementedError() 273 | 274 | 275 | def make_task(**_kwargs): 276 | raise NotImplementedError() 277 | 278 | 279 | def make_job(**_kwargs): 280 | raise NotImplementedError() 281 | 282 | 283 | def make_run(**_kwargs): 284 | raise NotImplementedError() 285 | -------------------------------------------------------------------------------- /ncluster/local_backend.py: -------------------------------------------------------------------------------- 1 | """Local implementation of backend.py using separate tmux sessions for jobs. 2 | 3 | Not thread-safe. 4 | """ 5 | 6 | import glob 7 | import os 8 | import shlex 9 | import socket 10 | import time 11 | from typing import List 12 | 13 | from ncluster import ncluster_globals 14 | from . import backend 15 | from . 
import util 16 | 17 | TASKDIR_ROOT = '/tmp/ncluster/task' 18 | SCRATCH_ROOT = '/tmp/ncluster/scratch' 19 | LOGDIR_ROOT = os.environ[ 20 | 'HOME'] + '/ncluster/runs' # use ~ instead of /tmp because /tmp gets wiped 21 | 22 | 23 | # todo: tmux session names are backwards from AWS job names (runname-jobname) 24 | # TODO: add kwargs so that tmux backend can be drop-in replacement 25 | 26 | 27 | # TODO: rename extra_kwargs to kwargs everywhere 28 | class Task(backend.Task): 29 | """Local tasks interact with tmux session where session name is derived 30 | from job name, and window names are task ids.""" 31 | tmux_window_id: int 32 | tmux_available_window_ids: List[int] 33 | 34 | def __init__(self, name, *, tmux_session, install_script='', job=None, 35 | **kwargs): 36 | 37 | self.homedir = os.environ['HOME'] 38 | self._cmd_fn = None 39 | self._cmd = None 40 | self._status_fn = None # location of output of last status 41 | self._out_fn = None 42 | 43 | self._can_run = False 44 | self.tmux_session = tmux_session 45 | self.tmux_window_id = 0 46 | self.tmux_available_window_ids = [0] 47 | 48 | self.name = name 49 | self.install_script = install_script 50 | self.job = job 51 | self.kwargs = kwargs 52 | 53 | # local servers sometimes listen only on localhost (TensorBoard), and sometimes only on 54 | # externally assigned ip address from gethostbyname (Ray), must choose one, so use the localhost for TB compatibility 55 | # https://github.com/ray-project/ray/issues/1677 56 | self.public_ip = socket.gethostbyname(socket.gethostname()) 57 | # self.public_ip = '127.0.0.1' 58 | self.ip = self.public_ip 59 | 60 | self.connect_instructions = 'tmux a -t ' + self.tmux_session 61 | 62 | # task current dir 63 | print('name is', name) 64 | # tmpdir = f"{util.reverse_taskname(name)}.{os.getpid()}.{util.now_micros()}" 65 | launch_id = util.random_id() 66 | self.taskdir = f"{TASKDIR_ROOT}/{name}-{launch_id}" 67 | self.local_scratch = f"{SCRATCH_ROOT}/{name}-{launch_id}" 68 | self.remote_scratch 
def run(self, cmd, non_blocking=False, ignore_errors=False, **_kwargs):
  """Sends command to the task's active tmux window and waits for its exit status.

  Args:
    cmd: shell command; a multi-line string is executed line by line
    non_blocking: if True, returns 0 right after the command is sent
    ignore_errors: if True, non-zero exit status is logged instead of raised
  Returns:
    Exit status of the command, -1 for an empty/commented out line. When
    NCLUSTER_RUN_WITH_OUTPUT_ON_FAILURE is set the call is delegated to
    _run_with_output_on_failure and its return value is passed through.
  Raises:
    RuntimeError: command returned non-zero status and ignore_errors is False
  """

  if util.is_set('NCLUSTER_RUN_WITH_OUTPUT_ON_FAILURE'):
    # HACK: the original bash built-in special case was short-circuited with
    # `or True`, so every command takes the output-capturing path
    return self._run_with_output_on_failure(cmd, non_blocking, ignore_errors,
                                            **_kwargs)

  if not self._can_run:
    raise AssertionError("Using .run before initialization finished")
  if '\n' in cmd:
    cmds = cmd.split('\n')
    self.log(
      f"Running {len(cmds)} commands at once, returning status of last")
    status = -1
    for subcmd in cmds:
      # sub-commands always block so that they execute strictly in order
      status = self.run(subcmd, ignore_errors=ignore_errors)
    return status

  cmd = cmd.strip()
  if not cmd or cmd.startswith('#'):  # ignore empty/commented out lines
    return -1
  self.run_counter += 1
  self.log("tmux> %s", cmd)

  self._cmd = cmd
  self._cmd_fn = f'{self.local_scratch}/{self.run_counter}.cmd'
  self._status_fn = f'{self.remote_scratch}/{self.run_counter}.status'
  assert not os.path.exists(self._status_fn)

  cmd = util.shell_strip_comment(cmd)
  # assert '&' not in cmd, f"cmd {cmd} contains &, that breaks things"

  self.write(self._cmd_fn, cmd + '\n')
  # exit status of the command is captured through a file written by the shell
  modified_cmd = shlex.quote(f'{cmd} ; echo $? > {self._status_fn}')

  tmux_window = self.tmux_session+':'+str(self.tmux_window_id)
  tmux_cmd = f'tmux send-keys -t {tmux_window} {modified_cmd} Enter'
  self._run_raw(tmux_cmd, ignore_errors=ignore_errors)
  if non_blocking:
    return 0

  if not self.wait_for_file(self._status_fn, max_wait_sec=60):
    self.log(f"Retrying waiting for {self._status_fn}")
  while not self.exists(self._status_fn):
    self.log(f"Still waiting for {cmd}")
    self.wait_for_file(self._status_fn, max_wait_sec=60)
  contents = self.read(self._status_fn)

  # the redirect creates the status file before `echo` fills it, so it can be
  # observed empty; poll for a bounded time instead of sleeping just once
  for _ in range(100):
    if contents.strip():
      break
    time.sleep(0.01)
    with open(self._status_fn) as status_file:
      contents = status_file.read()
  status = int(contents.strip())
  self.last_status = status

  if status != 0:
    if not ignore_errors:
      raise RuntimeError(f"Command {cmd} returned status {status}")
    else:
      self.log(f"Warning: command {cmd} returned status {status}")

  return status
def switch_window(self, window_id: int):
  """
  Makes `window_id` the tmux window that subsequent commands are sent to.

  Windows are numbered sequentially 0, 1, 2, ...; 0 is the default window.
  Every window between the highest known id and `window_id` that does not
  exist yet is created and pointed at the task directory.
  Args:
    window_id: integer id of tmux window to use
  """
  known_ids = self.tmux_available_window_ids
  if window_id not in known_ids:
    quoted_cd = shlex.quote(f'cd {self.taskdir}')
    next_id = max(known_ids) + 1
    while next_id <= window_id:
      self._run_raw(f'tmux new-window -t {self.tmux_session} -d')
      target = self.tmux_session + ':' + str(next_id)
      self._run_raw(f'tmux send-keys -t {target} {quoted_cd} Enter')
      known_ids.append(next_id)
      next_id += 1

  self.tmux_window_id = window_id
cmd 226 | self._cmd_fn = f'{self.local_scratch}/{self.run_counter}.cmd' 227 | self._status_fn = f'{self.remote_scratch}/{self.run_counter}.status' 228 | self._out_fn = f'{self.remote_scratch}/{self.run_counter}.out' 229 | assert not os.path.exists(self._status_fn) 230 | 231 | cmd = util.shell_strip_comment(cmd) 232 | # assert '&' not in cmd, f"cmd {cmd} contains &, that breaks things" 233 | 234 | self.write(self._cmd_fn, cmd + '\n') 235 | # modified_cmd = f'{cmd} ; echo $? > {self._status_fn}' 236 | modified_cmd = f'{cmd} > >(tee -a {self._out_fn}) 2> >(tee -a {self._out_fn} >&2); echo $? > {self._status_fn}' 237 | modified_cmd = shlex.quote(modified_cmd) 238 | 239 | tmux_window = self.tmux_session+':'+str(self.tmux_window_id) 240 | tmux_cmd = f'tmux send-keys -t {tmux_window} {modified_cmd} Enter' 241 | self._run_raw(tmux_cmd) 242 | if non_blocking: 243 | return '' 244 | 245 | if not self.wait_for_file(self._status_fn, max_wait_sec=60): 246 | self.log(f"Retrying waiting for {self._status_fn}") 247 | while not self.exists(self._status_fn): 248 | self.log(f"Still waiting for {cmd}") 249 | self.wait_for_file(self._status_fn, max_wait_sec=60) 250 | contents = self.read(self._status_fn) 251 | 252 | # if empty wait a bit to allow for race condition 253 | if len(contents) == 0: 254 | time.sleep(0.01) 255 | status = int(open(self._status_fn).read().strip()) 256 | self.last_status = status 257 | 258 | if status != 0: 259 | extra_msg = '(ignoring error)' if ignore_errors else '(failing)' 260 | self.log( 261 | f"Start failing output {extra_msg}: \n{'*'*80}\n\n '{self.read(self._out_fn)}'") 262 | self.log(f"\n{'*'*80}\nEnd failing output") 263 | if not ignore_errors: 264 | raise RuntimeError(f"Command {cmd} returned status {status}") 265 | else: 266 | self.log(f"Warning: command {cmd} returned status {status}") 267 | 268 | return self.read(self._out_fn) 269 | 270 | def _run_raw(self, cmd, ignore_errors=False): 271 | """Runs command directly, skipping tmux interface""" 272 | # 
TODO: capture stdout/stderr for feature parity with aws_backend 273 | result = os.system(cmd) 274 | if result != 0: 275 | if ignore_errors: 276 | self.log(f"command ({cmd}) failed.") 277 | assert False, "_run_raw failed" 278 | 279 | def upload(self, local_fn, remote_fn=None, dont_overwrite=False): 280 | """Uploads file to remote instance. If location not specified, dumps it 281 | into default directory. Creates missing directories in path name.""" 282 | 283 | # support wildcard through glob 284 | if '*' in local_fn: 285 | for local_subfn in glob.glob(local_fn): 286 | self.upload(local_subfn) 287 | return 288 | 289 | if remote_fn is None: 290 | remote_fn = os.path.basename(local_fn) 291 | 292 | if dont_overwrite and self.exists(remote_fn): 293 | self.log("Remote file %s exists, skipping" % (remote_fn,)) 294 | return 295 | 296 | if not remote_fn.startswith('/'): 297 | remote_fn = self.taskdir + '/' + remote_fn 298 | 299 | remote_fn = remote_fn.replace('~', self.homedir) 300 | self.log('uploading ' + local_fn + ' to ' + remote_fn) 301 | 302 | local_fn = os.path.abspath(local_fn) 303 | self._run_raw("cp -R %s %s" % (local_fn, remote_fn)) 304 | 305 | def download(self, remote_fn, local_fn='.'): 306 | if local_fn == '.': 307 | local_fn = self._cwd 308 | # self.log("downloading %s to %s" % (remote_fn, local_fn)) 309 | if not remote_fn.startswith('/'): 310 | remote_fn = self._cwd + '/' + remote_fn 311 | if self.exists(remote_fn): 312 | os.system(f'cp {remote_fn} {local_fn}') 313 | else: 314 | raise RuntimeError(f"No such file {remote_fn}") 315 | 316 | def exists(self, remote_fn): 317 | return os.path.exists(remote_fn) 318 | 319 | def read(self, remote_fn): 320 | tmp_fn = self.local_scratch + '/' + str(util.now_micros()) 321 | self.download(remote_fn, tmp_fn) 322 | return open(tmp_fn).read() 323 | 324 | def write(self, remote_fn, contents): 325 | def make_temp_fn(): 326 | """Returns temporary filename for this task.""" 327 | return self.local_scratch + '/write.' 
@property
def logdir(self):
  """Logging directory of the run this task belongs to, created on first use.

  When no logdir is recorded for the run yet, the run's chief task is asked to
  set it up and the recorded value is looked up again.
  """
  run_name = ncluster_globals.get_run_for_task(self)
  existing = ncluster_globals.get_logdir(run_name)
  if not existing:
    # only a single task in a group (the chief) creates the logdir
    if ncluster_globals.is_chief(self, run_name):
      creator = self
    else:
      creator = ncluster_globals.get_chief(run_name)
    creator.setup_logdir()
    return ncluster_globals.get_logdir(run_name)
  return existing
class Run:
  """Run is a collection of jobs that share state. IE, training run will contain gradient worker job, parameter
  server job, and TensorBoard visualizer job. These jobs will use the same shared directory to store checkpoints and
  event files.

  Jobs created through make_job are recorded in self.jobs, which is what the
  fan-out methods (run, run_with_output, _run_raw, upload) iterate over.
  """
  jobs: List['Job']  # forward reference, avoids depending on definition order

  def __init__(self, name='', **kwargs):
    """Creates a run.

    Args:
      name: name of the run
      **kwargs: stored unchanged on the run as self.kwargs
    """

    self.name = name
    self.kwargs = kwargs
    # must exist before any job is made, otherwise run/upload fail with
    # AttributeError
    self.jobs = []

  @property
  def logdir(self):
    """Logdir of the chief task registered for this run."""
    chief_task = ncluster_globals.get_chief(self.name)
    return chief_task.logdir

  # TODO: currently this is synchronous, use non_blocking wrapper like in Job to parallelize methods
  def run(self, *args, **kwargs):
    """Runs command on every job in the run."""

    for job in self.jobs:
      job.run(*args, **kwargs)

  def run_with_output(self, *args, **kwargs):
    """Calls run_with_output on every job in the run, results are discarded."""
    for job in self.jobs:
      job.run_with_output(*args, **kwargs)

  def _run_raw(self, *args, **kwargs):
    """_run_raw on every job in the run."""
    for job in self.jobs:
      job._run_raw(*args, **kwargs)

  def upload(self, *args, **kwargs):
    """Uploads on every job in the run."""
    for job in self.jobs:
      job.upload(*args, **kwargs)

  def make_job(self, name='', **kwargs):
    """Creates job named `<name>.<run name>` and records it in self.jobs."""
    job = make_job(name+'.'+self.name, run_name=self.name, **kwargs)
    self.jobs.append(job)
    return job
def make_job(name="",
             num_tasks=1,
             run_name="",
             install_script='',
             **kwargs
             ) -> backend.Job:
  """Creates a job of `num_tasks` local tasks named `<index>.<job name>`.

  The job name is auto-assigned when empty and validated before any task is
  launched; remaining keyword arguments are handed both to every task and to
  the job itself.
  """
  assert num_tasks > 0, f"Can't create job with {num_tasks} tasks"

  name = ncluster_globals.auto_assign_job_name_if_needed(name)
  util.validate_ncluster_job_name(name)

  launched = []
  for index in range(num_tasks):
    task = make_task(f"{index}.{name}",
                     run_name=run_name,
                     install_script=install_script,
                     **kwargs)
    launched.append(task)

  return backend.Job(name=name, tasks=launched, **kwargs)
def set_backend(backend_name: str):
  """Sets backend (local or aws)

  The request is validated completely before any module state is touched, so
  a rejected call leaves the previously selected backend (name and module)
  intact.

  Args:
    backend_name: 'aws' or 'local'
  Raises:
    AssertionError: a task has already been launched, or backend_name is
      unknown
  """
  global _backend, _backend_name

  # explicit raises (same type as the former asserts) survive python -O
  if ncluster_globals.task_launched:
    raise AssertionError(
      "Not allowed to change backend after launching a task (this pattern is error-prone)")
  if backend_name == 'aws':
    selected = aws_backend
  elif backend_name == 'local':
    selected = local_backend
  else:
    raise AssertionError(f"Unknown backend {backend_name}")

  # commit name and module together so get_backend() and get_backend_module()
  # can never disagree
  _backend_name = backend_name
  _backend = selected
  ncluster_globals.LOGDIR_ROOT = _backend.LOGDIR_ROOT
# TODO: remove?
def join(things_to_join):
  """Calls .join() on a single object or on every element of an iterable.

  Args:
    things_to_join: object with a join() method, or an iterable of such
      objects
  """
  # `collections.Iterable` was removed in Python 3.10, the ABC lives in
  # collections.abc; imported locally to keep the block self-contained
  from collections.abc import Iterable

  if isinstance(things_to_join, Iterable):
    for thing in things_to_join:
      thing.join()
  else:
    things_to_join.join()
import backend 13 | 14 | LOGDIR_ROOT = None 15 | task_launched = False # keep track whether anything has been launched 16 | 17 | task_counter = 0 18 | job_counter = 0 19 | run_counter = 0 20 | 21 | run_dict: Dict[str, Any] = {} 22 | task_run_dict: Dict[backend.Task, str] = {} 23 | run_task_dict: Dict[str, backend.Task] = {} 24 | run_logdir_dict: Dict[str, str] = {} 25 | 26 | tasks_seen: List[backend.Task] = [] # list of all tasks created 27 | 28 | 29 | def auto_assign_task_name_if_needed(name, instance_type='', image_name='', 30 | tasks=1): 31 | global task_counter 32 | if name: 33 | return name 34 | 35 | main_script = os.path.abspath(sys.argv[0]) 36 | script_id = util.alphanumeric_hash( 37 | f"{main_script}-{instance_type}-{image_name}-{tasks}") 38 | name = f"unnamedtask-{task_counter}-{script_id}" 39 | task_counter += 1 40 | return name 41 | 42 | 43 | def auto_assign_job_name_if_needed(name): 44 | global job_counter 45 | if name: 46 | return name 47 | script_id = util.alphanumeric_hash(sys.argv[0]) 48 | name = f"unnamedjob-{job_counter}-{script_id}" 49 | job_counter += 1 50 | return name 51 | 52 | 53 | def auto_assign_run_name_if_needed(name): 54 | global run_counter 55 | if name: 56 | return name 57 | script_id = util.alphanumeric_hash(sys.argv[0]) 58 | name = f"unnamedrun-{run_counter}-{script_id}" 59 | run_counter += 1 60 | return name 61 | 62 | 63 | # def add_job_to_run(job, run_name): 64 | # global run_dict, job_run_dict 65 | # return job_run_dict.get(job, '') 66 | # 67 | 68 | # def register_run(name: str, run): 69 | # global run_dict, placement_dict 70 | # run_dict[name] = run 71 | # 72 | 73 | 74 | def register_task(task: Any, run_name: str): 75 | global task_run_dict, run_task_dict, tasks_seen 76 | assert task.name not in tasks_seen 77 | tasks_seen.append(task.name) 78 | task_run_dict[task] = run_name 79 | run_task_dict.setdefault(run_name, []).append(task) 80 | 81 | 82 | def register_run(run: backend.Run, run_name) -> None: 83 | assert run_name not in 
run_dict 84 | assert run_name # empty name reserved to mean no run 85 | run_dict[run_name] = run 86 | 87 | 88 | def is_chief(task: backend.Task, run_name: str): 89 | """Returns True if task is chief task in the corresponding run""" 90 | global run_task_dict 91 | if run_name not in run_task_dict: 92 | return True 93 | task_list = run_task_dict[run_name] 94 | assert task in task_list, f"Task {task.name} doesn't belong to run {run_name}" 95 | return task_list[0] == task 96 | 97 | 98 | def get_chief(run_name: str): 99 | assert run_name in run_task_dict, f"Run {run_name} doesn't exist" 100 | tasks = run_task_dict[run_name] 101 | assert tasks, f"Run {run_name} had tasks {tasks}, expected non-empty list" 102 | return tasks[0] 103 | 104 | 105 | def get_logdir(run_name: str): 106 | """Returns logdir for this run. It is the job of logdir creator to set logdir for this run""" 107 | 108 | if not run_name: 109 | return '/tmp' 110 | return run_logdir_dict.get(run_name, '') 111 | 112 | 113 | def get_run_object(run_name: str) -> backend.Run: 114 | return run_dict.get(run_name, None) 115 | 116 | 117 | def set_logdir(run_name, logdir): 118 | assert run_name not in run_logdir_dict, f"logdir for run {run_name} has already been set to {run_logdir_dict[run_name]}, trying to change it to {logdir} is illegal" 119 | run_logdir_dict[run_name] = logdir 120 | 121 | 122 | def get_run_for_task(task: backend.Task) -> str: 123 | """Gets run name associated with given Task""" 124 | return task_run_dict.get(task, '') 125 | 126 | 127 | def get_run_object(run_name: str) -> backend.Run: 128 | return run_dict.get(run_name, None) 129 | 130 | 131 | def create_run_if_needed(run_name, run_creation_callback) -> backend.Run: 132 | if run_name in run_dict: 133 | return run_dict[run_name] 134 | run = run_creation_callback(run_name) 135 | return run 136 | -------------------------------------------------------------------------------- /ncluster/summary.txt: 
# --------------------------------------------------------------------------------
# tf_two_machines -- 500 on t3, 910 on c3
#
# --------------------------------------------------------------------------------
# /ncluster/test.py:
# --------------------------------------------------------------------------------

print("%20s" % ('asdfasdf',))
print(f"{'asdfasdf':>20}")

print("%5.2f" % (5.5,))
print(f"{5.5:5.2f}")

# --------------------------------------------------------------------------------
# /ncluster/util.py:
# --------------------------------------------------------------------------------
"""
Various helper utilities used internally by ncluster project, but may be potentially
useful outside of the cluster project.
"""

import os
import random
import string
import sys
import time
# `collections.Iterable` was removed in Python 3.10; the ABC lives in collections.abc
from collections.abc import Iterable
import shlex

# starting value for now_micros (Aug 31, 2018)
# using this to make various timestamped names shorter
EPOCH_MICROS = 1535753974788163

# from compgen -b
_BASH_BUILTINS = frozenset([
    'alias', 'bg', 'bind', 'break', 'builtin', 'caller', 'cd', 'command',
    'compgen', 'complete', 'compopt', 'continue', 'declare', 'dirs', 'disown',
    'echo', 'enable', 'eval', 'exec', 'exit', 'export', 'false', 'fc', 'fg',
    'getopts', 'hash', 'help', 'history', 'jobs', 'kill', 'let', 'local',
    'logout', 'mapfile', 'popd', 'printf', 'pushd', 'pwd', 'read',
    'readarray', 'readonly', 'return', 'set', 'shift', 'shopt', 'source',
    'suspend', 'test', 'times', 'trap', 'true', 'type', 'typeset', 'ulimit',
    'umask', 'unalias', 'unset', 'wait',
])


def is_iterable(k):
    """Return True if k is an instance of collections.abc.Iterable."""
    return isinstance(k, Iterable)


def now_micros(absolute=False) -> int:
    """Return current micros as integer.

    Measured since the Unix epoch when absolute is True, otherwise relative
    to EPOCH_MICROS.
    """
    micros = int(time.time() * 1e6)
    if absolute:
        return micros
    return micros - EPOCH_MICROS


def now_millis(absolute=False) -> int:
    """Return current millis as integer.

    Measured since the Unix epoch when absolute is True, otherwise relative
    to EPOCH_MICROS converted to milliseconds.
    """
    millis = int(time.time() * 1e3)
    if absolute:
        return millis
    return millis - EPOCH_MICROS // 1000


def current_timestamp() -> str:
    """Return local time formatted as 'YYYY-MM-DD HH:MM:SS.micros'."""
    # timestamp format from https://github.com/tensorflow/tensorflow/blob/155b45698a40a12d4fef4701275ecce07c3bb01a/tensorflow/core/platform/default/logging.cc#L80
    current_seconds = time.time()
    remainder_micros = int(1e6 * (current_seconds - int(current_seconds)))
    time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(current_seconds))
    full_time_str = "%s.%06d" % (time_str, remainder_micros)
    return full_time_str


def log_error(*args, **kwargs):
    """Print args and kwargs prefixed with "Error encountered"."""
    print(f"Error encountered {args} {kwargs}")


def log(*args, **kwargs):
    """Print the args tuple and kwargs dict."""
    print(f"{args} {kwargs}")


def install_pdb_handler():
    """Make CTRL+\\ (SIGQUIT) break into pdb."""

    import signal
    import pdb

    def handler(_signum, _frame):
        pdb.set_trace()

    signal.signal(signal.SIGQUIT, handler)


def shell_add_echo(script):
    """Goes over each line of script, adds "echo cmd" in front of each cmd.

    ls a

    becomes

    echo * ls a
    ls a

    Lines are stripped and blank lines are dropped; the echoed command is
    shell-quoted.
    """
    pieces = []
    for cmd in script.split('\n'):
        cmd = cmd.strip()
        if not cmd:
            continue
        pieces.append("echo \\* " + shlex.quote(cmd) + "\n")
        pieces.append(cmd + "\n")
    return "".join(pieces)


def shell_strip_comment(cmd):
    """ hi # testing => hi

    Everything from the first '#' onwards is removed; whitespace before it is kept.
    """
    if '#' in cmd:
        return cmd.split('#', 1)[0]
    else:
        return cmd


def random_id(k=5):
    """Random id to use for AWS identifiers (k lowercase letters/digits)."""
    # https://stackoverflow.com/questions/2257441/random-string-generation-with-upper-case-letters-and-digits-in-python
    return ''.join(random.choices(string.ascii_lowercase + string.digits, k=k))


def alphanumeric_hash(s: str, size=5):
    """Short alphanumeric string derived from hash of given string.

    Returns the first `size` characters of the lowercased base32 encoding of
    the MD5 digest of s.
    """
    import hashlib
    import base64
    # utf-8 yields the same bytes as ascii for ASCII input, and additionally
    # accepts non-ASCII strings instead of raising UnicodeEncodeError
    hash_object = hashlib.md5(s.encode('utf-8'))
    encoded = base64.b32encode(hash_object.digest())
    result = encoded[:size].decode('ascii').lower()
    return result


def reverse_taskname(name: str) -> str:
    """
    Reverses components in the name of task. Reversed convention is used for filenames since
    it groups log/scratch files of related tasks together

    0.somejob.somerun -> somerun.somejob.0
    0.somejob -> somejob.0
    somename -> somename

    Args:
      name: name of task

    """
    components = name.split('.')
    assert len(components) <= 3
    return '.'.join(components[::-1])


def is_bash_builtin(cmd):
    """Return true if command is invoking bash built-in
    """
    toks = cmd.split()
    return bool(toks) and toks[0] in _BASH_BUILTINS


def is_set(name):
    """Helper method to check if given property is set.

    Reads environment variable `name`; unset counts as '0'. Any value other
    than '0' or '1' fails the assertion.
    """
    val = os.environ.get(name, '0')
    assert val == '0' or val == '1', f"env var {name} has value {val}, expected 0 or 1"
    return val == '1'


def assert_script_in_current_directory():
    """Assert fail if current directory is different from location of the script"""

    script = sys.argv[0]
    assert os.path.abspath(os.path.dirname(script)) == os.path.abspath(
        '.'), f"Change into directory of script {script} and run again."
def validate_ncluster_job_name(name):
    """Asserts that the job name contains at most one '.' separator."""
    dot_count = name.count('.')
    assert dot_count <= 1, "Job name has too many .'s (see ncluster design: Run/Job/Task hierarchy for convention)"


def toseconds(dt):
    """Converts datetime object to seconds."""
    # NOTE(review): time.mktime interprets its argument as local time while
    # utctimetuple() yields UTC fields -- confirm callers only rely on
    # differences/ordering of these values.
    utc_fields = dt.utctimetuple()
    return time.mktime(utc_fields)

# --------------------------------------------------------------------------------
# /requirements.txt:
# --------------------------------------------------------------------------------
# wrapt
# ray
# numpy
# torch
# tensorflow
# boto3
# paramiko
# portpicker
# tzlocal
#
# --------------------------------------------------------------------------------
# /setup.cfg:
# --------------------------------------------------------------------------------
# [metadata]
# name = ncluster
# version = 0.1.20
# author = Yaroslav Bulatov, Andrew Shaw
# author_email = yaroslavvb@gmail.com
# description= Lightweight interface to launching jobs in the cloud
# long_description = file: README.md
# long_description_content_type = text/markdown
# license_file = LICENSE
# url = https://github.com/diux-dev/ncluster
# classifiers =
#     Programming Language :: Python :: 3
#     License :: OSI Approved :: MIT License
#     Operating System :: OS Independent
#
# [options]
# python_requires = >= 3.6
# setup_requires =
#     setuptools >= 38.6
#     pip >= 10
#     twine >= 1.11
# packages = find:
#
# --------------------------------------------------------------------------------
# /setup.py:
# --------------------------------------------------------------------------------
from setuptools import setup
setup(scripts=['ncluster/aws_create_resources.py',
               'ncluster/aws_delete_resources.py'])

# --------------------------------------------------------------------------------
# /tests/join_test.py:
# --------------------------------------------------------------------------------
import ncluster
import pytest

def test():
    """join() tolerates a failed command with ignore_errors, and raises RuntimeError without it."""
    probe = ncluster.make_task(
        image_name=ncluster.aws_backend.GENERIC_SMALL_IMAGE)

    probe.run("mkdir /illegal", non_blocking=True)
    probe.join(ignore_errors=True)  # this succeeds / prints an error message

    probe.run("mkdir /illegal", non_blocking=True)
    with pytest.raises(RuntimeError):
        probe.join()  # this should fail


if __name__ == '__main__':
    test()

# --------------------------------------------------------------------------------
# /tests/logdir_test.py:
# --------------------------------------------------------------------------------
#!/bin/env python
# tests to make sure that logdir logic works
import inspect
import random
import sys
import threading

import ncluster


def test_two_jobs():
    """A file written into the logdir by one job's task is readable from another job's task."""
    shared_run = ncluster.make_run('logdir_test')
    writer = shared_run.make_job('job1').tasks[0]
    writer.run(f'echo hello > {writer.logdir}/message')
    reader = shared_run.make_job('job2').tasks[0]
    contents = reader.read(f'{reader.logdir}/message')
    assert contents.strip() == 'hello'


def test_multiple_logdirs():
    """Checks logdir naming for a fresh run name and for one whose directory already exists."""
    root = ncluster.get_logdir_root()

    expected_first = root + '/test1'
    helper = ncluster.make_task()
    helper.run(f'rm -Rf {expected_first}')
    first = ncluster.make_task(run_name='test1')
    assert first.logdir == expected_first

    base_second = ncluster.get_logdir_root() + '/test2'
    second = ncluster.make_task(run_name='test2')
    helper.run(f'rm -Rf {base_second}*')
    helper.run(f'mkdir {base_second}')
    assert second.logdir == base_second + '.01'


def test_multiple_logdir_tasks():
    """All tasks of one job report the same logdir when queried from several threads."""
    n = 10
    helper = ncluster.make_task()
    expected = ncluster.get_logdir_root() + '/test1'
    helper.run(f'rm -Rf {expected}')
    job = ncluster.make_job(run_name='test1', num_tasks=n)

    seen_logdirs = []

    import wrapt

    @wrapt.synchronized
    def query(i):
        seen_logdirs.append(job.tasks[i].logdir)

    workers = []
    for i in range(n):
        workers.append(threading.Thread(target=query, args=(i,)))

    # start in reverse order, join in random order
    for worker in workers[::-1]:
        worker.start()

    random.shuffle(workers)
    for worker in workers:
        worker.join()

    assert len(set(seen_logdirs)) == 1
    assert seen_logdirs[0] == expected


def run_all_tests(module):
    """Calls every function in module whose name starts with 'test', printing progress."""
    for name, func in inspect.getmembers(module, inspect.isfunction):
        if not name.startswith('test'):
            continue
        print("Testing " + name)
        func()
    print(module.__name__ + " tests passed.")


def manual():
    """Runs all tests defined in this module."""
    this_module = sys.modules[__name__]
    run_all_tests(this_module)


if __name__ == '__main__':
    manual()

# --------------------------------------------------------------------------------
# /tests/run_test.py:
# --------------------------------------------------------------------------------
import ncluster


def test():
    """Tasks are named '<index>.<job>.<run>', and jobs of one run see each other's logdir files."""
    shared_run = ncluster.make_run('run_test')

    first_job = shared_run.make_job('job1')
    writer = first_job.tasks[0]
    assert writer.name == '0.job1.run_test'
    writer.run(f'echo task1sayshello > {writer.logdir}/message')

    second_job = shared_run.make_job('job2')
    reader = second_job.tasks[0]
    assert reader.name == '0.job2.run_test'
    contents = reader.read(f'{reader.logdir}/message')
    assert contents.strip() == 'task1sayshello'


if __name__ == '__main__':
    test()

# --------------------------------------------------------------------------------