├── .doctrees
    ├── .eggs
    │   ├── requests-2.19.1-py3.6.egg
    │   │   └── EGG-INFO
    │   │   │   └── DESCRIPTION.doctree
    │   └── urllib3-1.23-py3.6.egg
    │   │   └── EGG-INFO
    │   │       └── DESCRIPTION.doctree
    ├── environment.pickle
    └── index.doctree
├── .gitignore
├── LICENSE
├── README.md
├── benchmarks
    ├── README.md
    ├── mpi_two_machines.py
    ├── pytorch_two_machines.py
    ├── ray_ps.py
    ├── ray_two_machines.py
    ├── ray_two_machines_local.py
    ├── requirements.txt
    ├── summary.txt
    ├── tf_two_machines.py
    ├── tf_two_machines_local.py
    └── util.py
├── examples
    ├── deleteme.py
    ├── gpubox.py
    ├── gpubox_jupyter_notebook_config.py
    ├── gpubox_sample.ipynb
    ├── launch_16_instances.py
    ├── ray_example.py
    ├── requirements.txt
    ├── simple_job.py
    ├── simple_task.py
    ├── simple_tf.py
    ├── tf_adder.py
    └── tf_adder_tb.py
├── ncluster
    ├── __init__.py
    ├── aws_backend.py
    ├── aws_create_resources.py
    ├── aws_delete_resources.py
    ├── aws_util.py
    ├── backend.py
    ├── local_backend.py
    ├── ncluster.py
    ├── ncluster_globals.py
    ├── summary.txt
    ├── test.py
    └── util.py
├── requirements.txt
├── setup.cfg
├── setup.py
└── tests
    ├── join_test.py
    ├── logdir_test.py
    └── run_test.py


/.doctrees/.eggs/requests-2.19.1-py3.6.egg/EGG-INFO/DESCRIPTION.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diux-dev/ncluster/2fd359621896717197b479c7174d06d80df1529b/.doctrees/.eggs/requests-2.19.1-py3.6.egg/EGG-INFO/DESCRIPTION.doctree


--------------------------------------------------------------------------------
/.doctrees/.eggs/urllib3-1.23-py3.6.egg/EGG-INFO/DESCRIPTION.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diux-dev/ncluster/2fd359621896717197b479c7174d06d80df1529b/.doctrees/.eggs/urllib3-1.23-py3.6.egg/EGG-INFO/DESCRIPTION.doctree


--------------------------------------------------------------------------------
/.doctrees/environment.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diux-dev/ncluster/2fd359621896717197b479c7174d06d80df1529b/.doctrees/environment.pickle


--------------------------------------------------------------------------------
/.doctrees/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diux-dev/ncluster/2fd359621896717197b479c7174d06d80df1529b/.doctrees/index.doctree


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | /dist
 2 | /build
 3 | /.DS_Store
 4 | /ncluster.egg-info
 5 | /ncluster/__pycache__
 6 | /.eggs
 7 | /ncluster/.idea
 8 | /.idea
 9 | __pycache__
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) [2018] [Yaroslav Bulatov]
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ncluster
 2 | By Yaroslav Bulatov and Andrew Shaw
 3 | 
 4 | ```
 5 | import ncluster
 6 | task = ncluster.make_task(instance_type='p2.xlarge')
 7 | task.upload('myscript.py')
 8 | task.run('python myscript.py > out')
 9 | task.download('out')
10 | ```
11 | 
12 | ## Installation
13 | Install pip, tmux, Python 3.6 (see below), then
14 | 
15 | ```
16 | pip install -r https://raw.githubusercontent.com/diux-dev/ncluster/master/requirements.txt
17 | pip install ncluster
18 | ```
19 | 
20 | ### Extra
21 | An example of installing pip/tmux/python 3.6 on MacOS
22 | 
23 | 1. Download Anaconda distribution following https://conda.io/docs/user-guide/install/index.html
24 | 2. Install tmux through homebrew: https://brew.sh/, then `brew install tmux`
25 | 
26 | Then
27 | 
28 | ```
29 | conda create -n new python=3.6 -y
30 | conda activate new
31 | ```
32 | 
33 | Extra Deps:
34 | ```
35 | brew install fswatch
36 | ```
37 | 


--------------------------------------------------------------------------------
/benchmarks/README.md:
--------------------------------------------------------------------------------
 1 | Benchmarks
 2 | 
 3 | ```
 4 | pip install -r https://raw.githubusercontent.com/diux-dev/ncluster/master/requirements.txt
 5 | pip install ncluster
 6 | python <somebenchmark.py>
 7 | ```
 8 | 
 9 | 
10 | # Debugging
11 | ```
12 | export NCLUSTER_INSTANCE=c5.18xlarge
13 | export NCLUSTER_RUN_WITH_OUTPUT_ON_FAILURE=1
14 | ```
15 | 


--------------------------------------------------------------------------------
/benchmarks/mpi_two_machines.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """
  4 | Running locally
  5 | 
  6 | 004/11 sent 100 MBs in 28.4 ms: 3519.33 MB/second
  7 | 005/11 sent 100 MBs in 25.1 ms: 3988.50 MB/second
  8 | 006/11 sent 100 MBs in 25.5 ms: 3918.33 MB/second
  9 | 007/11 sent 100 MBs in 25.3 ms: 3958.61 MB/second
 10 | 008/11 sent 100 MBs in 25.3 ms: 3954.15 MB/second
 11 | 009/11 sent 100 MBs in 24.9 ms: 4009.78 MB/second
 12 | 010/11 sent 100 MBs in 25.0 ms: 3992.75 MB/second
 13 | min:    24.94, median:    25.52, mean:    29.53
 14 | 
 15 | 
 16 | """
 17 | 
 18 | import argparse
 19 | import json
 20 | import os
 21 | import numpy as np
 22 | import tensorflow as tf
 23 | import time
 24 | 
 25 | import util
 26 | 
 27 | parser = argparse.ArgumentParser()
 28 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS")
 29 | parser.add_argument("--iters", default=11, type=int,
 30 |                     help="Maximum number of additions")
 31 | parser.add_argument("--size-mb", default=100, type=int,
 32 |                     help="size of vector in MBs")
 33 | parser.add_argument("--shards", default=1, type=int,
 34 |                     help="how many ways to shard the variable")
 35 | parser.add_argument('--image',
 36 |                     default='Deep Learning AMI (Ubuntu) Version 15.0')
 37 | parser.add_argument('--name',
 38 |                     default='mpi')
 39 | 
 40 | # internal flags
 41 | parser.add_argument('--role', default='launcher', type=str)
 42 | args = parser.parse_args()
 43 | 
 44 | 
 45 | def run_launcher():
 46 |   import ncluster
 47 |   if args.aws:
 48 |     ncluster.set_backend('aws')
 49 | 
 50 |   job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
 51 |   job.upload(__file__)
 52 |   job.upload('util.py')
 53 | 
 54 |   # kill python just for when tmux session reuse is on
 55 |   if not ncluster.running_locally():
 56 |     job._run_raw('killall python', ignore_errors=True)
 57 | 
 58 |   if ncluster.get_backend() == 'aws':
 59 |     # on AWS probably running in conda DLAMI, switch into TF-enabled env
 60 |     job.run('source activate tensorflow_p36')
 61 | 
 62 |   
 63 |   hosts = [task.public_ip for task in job.tasks]
 64 |   host_str = ','.join(hosts)
 65 |   os.system(f'mpirun -np 2 --host {host_str} python {__file__} --role=worker')
 66 |   print(job.tasks[0].read('/tmp/out'))
 67 | 
 68 | 
 69 | def run_worker():
 70 |   param_size = 250 * 1000 * args.size_mb // args.shards  # 1MB is 250k integers
 71 | 
 72 |   from mpi4py import MPI
 73 |   comm = MPI.COMM_WORLD
 74 |   rank = comm.Get_rank()
 75 | 
 76 |   if rank == 0:
 77 |     log = util.FileLogger('/tmp/out')
 78 |     #    log = util.FileLogger('/dev/null', mirror=False)
 79 | 
 80 |   else:
 81 |     log = util.FileLogger('/dev/null', mirror=False)
 82 |   grads_array = []
 83 | 
 84 |   time_list = []
 85 |   dim = args.size_mb*250*1000
 86 |   dtype = np.float32
 87 |   data = np.ones(dim, dtype=dtype)*(rank+1)
 88 |   for i in range(args.iters):
 89 |     start_time = time.perf_counter()
 90 |     if rank == 0:
 91 |       comm.Send(data, dest=1, tag=13)
 92 |     else:
 93 |       data = np.empty(dim, dtype=dtype)
 94 |       comm.Recv(data, source=0, tag=13)
 95 |       
 96 |     end_time = time.perf_counter()
 97 |     
 98 |     elapsed_time_ms = (end_time - start_time) * 1000
 99 |     time_list.append(elapsed_time_ms)
100 |     rate = args.size_mb / (elapsed_time_ms / 1000)
101 |     log(f'{rank} {i:03d}/{args.iters:d} sent {args.size_mb:d} MBs in {elapsed_time_ms:.1f}'
102 |         f' ms: {rate:.2f} MB/second')
103 | 
104 |   min = np.min(time_list)
105 |   median = np.median(time_list)
106 | 
107 |   log(f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")
108 | 
109 | 
110 | def main():
111 |   # run local benchmark in launcher and launch service
112 |   if args.role == "launcher":
113 |     run_launcher()
114 |   elif args.role == "worker":
115 |     run_worker()
116 |   else:
117 |     assert False, 'unknown role'
118 | 
119 | 
120 | if __name__ == '__main__':
121 |   main()
122 | 


--------------------------------------------------------------------------------
/benchmarks/pytorch_two_machines.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #
  3 | # Run locally:
  4 | # ./pytorch_two_machines.py
  5 | # 000/10 added 100 MBs in 35.0 ms: 2854.88 MB/second
  6 | # 001/10 added 100 MBs in 25.1 ms: 3979.37 MB/second
  7 | # 002/10 added 100 MBs in 25.4 ms: 3935.73 MB/second
  8 | # 003/10 added 100 MBs in 24.7 ms: 4040.93 MB/second
  9 | # 004/10 added 100 MBs in 24.4 ms: 4097.57 MB/second
 10 | # min:    21.58, median:    24.97, mean:    25.61
 11 | 
 12 | # To run on AWS:
 13 | # export NCLUSTER_IMAGE='Deep Learning AMI (Ubuntu) Version 15.0'
 14 | # export NCLUSTER_INSTANCE=c5.18xlarge
 15 | # python pytorch_two_machines.py --aws
 16 | # 990/1000 added 100 MBs in 83.7 ms: 1194.35 MB/second
 17 | # 991/1000 added 100 MBs in 83.4 ms: 1198.78 MB/second
 18 | # 992/1000 added 100 MBs in 83.4 ms: 1198.73 MB/second
 19 | # 993/1000 added 100 MBs in 83.3 ms: 1201.20 MB/second
 20 | # 994/1000 added 100 MBs in 83.1 ms: 1203.84 MB/second
 21 | # 995/1000 added 100 MBs in 83.1 ms: 1203.04 MB/second
 22 | # 996/1000 added 100 MBs in 83.5 ms: 1197.38 MB/second
 23 | # 997/1000 added 100 MBs in 82.4 ms: 1213.99 MB/second
 24 | # 998/1000 added 100 MBs in 84.2 ms: 1187.69 MB/second
 25 | # 999/1000 added 100 MBs in 83.0 ms: 1204.13 MB/second
 26 | # min:    80.52, median:    83.25, mean:    83.29
 27 | 
 28 | import os
 29 | import sys
 30 | import time
 31 | import argparse
 32 | import util
 33 | 
 34 | parser = argparse.ArgumentParser(description='launch')
 35 | 
 36 | # launcher flags
 37 | parser.add_argument('--name', type=str, default='pytorch_two_machines',
 38 |                      help="name of the current run")
 39 | parser.add_argument('--size-mb', type=int, default=100,
 40 |                     help='size of data to send')
 41 | parser.add_argument('--iters', type=int, default=10,
 42 |                     help='how many iterations')
 43 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS")
 44 | parser.add_argument('--image',
 45 |                     default='Deep Learning AMI (Ubuntu) Version 15.0')
 46 | 
 47 | 
 48 | # mpi flags
 49 | parser.add_argument('--role', type=str, default='launcher',
 50 |                     help='internal flag, launcher or worker')
 51 | parser.add_argument('--rank', type=int, default=0,
 52 |                     help='mpi rank')
 53 | parser.add_argument('--size', type=int, default=0,
 54 |                     help='size of mpi world')
 55 | parser.add_argument('--master-addr', type=str, default='127.0.0.1',
 56 |                     help='address of master node')
 57 | parser.add_argument('--master-port', type=int, default=6006,
 58 |                     help='port of master node')
 59 | args = parser.parse_args()
 60 | 
 61 | def worker():
 62 |   """ Initialize the distributed environment. """
 63 | 
 64 |   import torch
 65 |   import torch.distributed as dist
 66 |   from torch.multiprocessing import Process
 67 |   import numpy as np
 68 | 
 69 |   print("Initializing distributed pytorch")
 70 |   os.environ['MASTER_ADDR'] = str(args.master_addr)
 71 |   os.environ['MASTER_PORT'] = str(args.master_port)
 72 |   # Use TCP backend. Gloo needs nightly, where it currently fails with
 73 |   #     dist.init_process_group('gloo', rank=args.rank,
 74 |   #   AttributeError: module 'torch.distributed' has no attribute 'init_process_group'
 75 |   dist.init_process_group('tcp', rank=args.rank,
 76 |                           world_size=args.size)
 77 | 
 78 |   tensor = torch.ones(args.size_mb*250*1000)*(args.rank+1)
 79 |   time_list = []
 80 |   outfile = 'out' if args.rank == 0 else '/dev/null'
 81 |   log = util.FileLogger(outfile)
 82 |   for i in range(args.iters):
 83 |     # print('before: rank ', args.rank, ' has data ', tensor[0])
 84 | 
 85 |     start_time = time.perf_counter()
 86 |     if args.rank == 0:
 87 |       dist.send(tensor=tensor, dst=1)
 88 |     else:
 89 |       dist.recv(tensor=tensor, src=0)
 90 |       
 91 |     elapsed_time_ms = (time.perf_counter() - start_time)*1000
 92 |     time_list.append(elapsed_time_ms)
 93 |     # print('after: rank ', args.rank, ' has data ', tensor[0])
 94 |     rate = args.size_mb/(elapsed_time_ms/1000)
 95 | 
 96 |     log('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb, elapsed_time_ms, rate))
 97 | 
 98 |   min = np.min(time_list)
 99 |   median = np.median(time_list)
100 |   log(f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")
101 | 
102 | 
103 | def launcher():
104 |   import ncluster
105 |   
106 |   if args.aws:
107 |     ncluster.set_backend('aws')
108 | 
109 |   job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
110 |   job.upload(__file__)
111 |   job.upload('util.py')
112 | 
113 |   if args.aws:
114 |     job.run('source activate pytorch_p36')
115 |   else:
116 |     job.run('source deactivate')
117 |     job.run('source activate ncluster-test3')
118 | 
119 |   script_name = os.path.basename(__file__)
120 |   common_args = f'--size=2 --master-addr={job.tasks[0].ip} --iters={args.iters} --size-mb={args.size_mb}'
121 |   job.tasks[0].run(f'python {script_name} --role=worker --rank=0 '+common_args,
122 |                    non_blocking=True)
123 |   job.tasks[1].run(f'python {script_name} --role=worker --rank=1 '+common_args,
124 |                    non_blocking=True)
125 | 
126 |   job.tasks[0].join()
127 |   print(job.tasks[0].read('out'))
128 |     
129 | 
130 | def main():
131 |   if args.role == "launcher":
132 |     launcher()
133 |   elif args.role == "worker":
134 |     worker()
135 |   else:
136 |     assert False, "Unknown role "+FLAGS.role
137 | 
138 |   
139 | if __name__ == "__main__":
140 |   main()
141 | 


--------------------------------------------------------------------------------
/benchmarks/ray_ps.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #
  3 | # Ray parameter server benchmark
  4 | #
  5 | # python ray_ps.py --aws --num-ps=1 --num-workers=1 --size-mb=100 --iters=100
  6 | 
  7 | # # 1 worker, 1 ps
  8 | # min:    61.61, median:    63.77, mean:    69.20
  9 | 
 10 | # # 1 worker, 2 ps
 11 | # python ray_ps.py --aws --num-ps=2 --num-workers=1 --size-mb=100 --iters=100
 12 | # min:    49.45, median:    50.91, mean:    58.92
 13 | 
 14 | # # 1 worker, 4 ps
 15 | # python ray_ps.py --aws --num-ps=4 --num-workers=1 --size-mb=100 --iters=100
 16 | # min:    47.98, median:    50.71, mean:    59.05
 17 | 
 18 | # # 4 worker, 4 ps
 19 | # python ray_ps.py --aws --num-ps=4 --num-workers=4 --size-mb=100 --iters=100
 20 | # 098/100 sent 400 MBs in 238.5 ms: 419.28 MB/second
 21 | # 099/100 sent 400 MBs in 242.0 ms: 413.22 MB/second
 22 | # min:   219.90, median:   241.51, mean:   245.95
 23 | # (54ms per worker since 4x more work done)
 24 | 
 25 | # # 1 worker, 4 ps, larger arrays
 26 | # python ray_ps.py --aws --num-ps=4 --num-workers=1 --size-mb=800 --iters=100
 27 | # min:   358.35, median:   544.59, mean:   513.47
 28 | #
 29 | # Bottom line, 50-60ms to send 100MB regardless of sharding/workers
 30 | 
 31 | import argparse
 32 | import os
 33 | import socket
 34 | import subprocess
 35 | import time
 36 | 
 37 | import numpy as np
 38 | import ray
 39 | 
 40 | import util
 41 | 
 42 | parser = argparse.ArgumentParser()
 43 | parser.add_argument("--role", default='launcher', type=str,
 44 |                     help="launcher/driver")
 45 | parser.add_argument('--image',
 46 |                     default='Deep Learning AMI (Ubuntu) Version 15.0')
 47 | parser.add_argument("--size-mb", default=10, type=int,
 48 |                     help='how much data to send at each iteration')
 49 | parser.add_argument("--num-workers", default=2, type=int)
 50 | parser.add_argument("--num-ps", default=2, type=int)
 51 | 
 52 | parser.add_argument("--iters", default=11, type=int)
 53 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS")
 54 | parser.add_argument("--xray", default=1, type=int,
 55 |                     help="whether to use XRay backend")
 56 | parser.add_argument('--nightly', default=1, type=int,
 57 |                     help='whether to use nightly version')
 58 | parser.add_argument('--name', default='ray_ps', type=str,
 59 |                     help='name of the run')
 60 | parser.add_argument("--ip", default='', type=str,
 61 |                     help="internal flag, used to point worker to head node")
 62 | args = parser.parse_args()
 63 | 
 64 | dim = args.size_mb * 250 * 1000 // args.num_ps
 65 | 
 66 | 
 67 | @ray.remote(resources={"worker": 1})
 68 | class Worker(object):
 69 |   def __init__(self):
 70 |     self.gradients = np.ones(dim, dtype=np.float32)
 71 | 
 72 |   @ray.method(num_return_vals=args.num_ps)
 73 |   def compute_gradients(self):
 74 |     if args.num_ps == 1:
 75 |       return self.gradients
 76 |     return [self.gradients]*args.num_ps
 77 | 
 78 |   def ip(self):
 79 |     return ray.services.get_node_ip_address()
 80 | 
 81 | 
 82 | @ray.remote(resources={"worker": 1})
 83 | class ParameterServer(object):
 84 |   def __init__(self):
 85 |     self.params = np.zeros(dim, dtype=np.float32)
 86 | 
 87 |   def receive(self, *grad_list):
 88 |     for grad in grad_list:
 89 |       self.params = grad  # use = just to get network overhead
 90 |     return self.params
 91 | 
 92 |   def get_weights(self):
 93 |     return self.params
 94 | 
 95 |   def ip(self):
 96 |     return ray.services.get_node_ip_address()
 97 | 
 98 | 
 99 | 
100 | def run_launcher():
101 |   import ncluster
102 | 
103 |   if args.aws:
104 |     ncluster.set_backend('aws')
105 | 
106 |   if args.nightly:
107 |     # running locally MacOS
108 |     if 'Darwin' in util.ossystem('uname') and not args.aws:
109 |       install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
110 |     else:
111 |       install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
112 |   else:
113 |     install_script = 'pip install ray'
114 | 
115 |   job = ncluster.make_job(name=args.name,
116 |                           install_script=install_script,
117 |                           image_name=args.image,
118 |                           num_tasks=args.num_workers+args.num_ps)
119 |   if not ncluster.running_locally():
120 |     job._run_raw('killall python', ignore_errors=True)
121 |   
122 |   job.upload(__file__)
123 |   job.upload('util.py')
124 |   if args.xray:
125 |     job.run('export RAY_USE_XRAY=1')
126 |   job.run('ray stop')
127 | 
128 |   head = job.tasks[0]
129 | 
130 |   # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
131 |   worker_resource = """--resources='{"worker": 1}'"""
132 |   head.run(f"ray start --head {worker_resource} --redis-port=6379")
133 | 
134 |   for task in job.tasks[1:]:
135 |     task.run(f"ray start --redis-address={head.ip}:6379 {worker_resource}")
136 |   
137 |   head.run(f'python {__file__} --role=driver --ip={head.ip}:6379 --size-mb={args.size_mb} --iters={args.iters} --num-workers={args.num_workers} --num-ps={args.num_ps}')
138 |   
139 |   print(head.read('out'))
140 | 
141 | 
142 | def transpose(list_of_lists):
143 |   return list(map(list, zip(*list_of_lists)))
144 | 
145 | 
146 | def run_driver():
147 |   ray.init(redis_address=args.ip)
148 | 
149 |   worker_actors = [Worker.remote() for _ in range(args.num_workers)]
150 |   ps_actors = [ParameterServer.remote() for _ in range(args.num_ps)]
151 |   
152 |   log = util.FileLogger('out')
153 | 
154 |   time_list = []
155 |   for i in range(args.iters):
156 |     start_time = time.perf_counter()
157 |     grads_list = []
158 |     for actor in worker_actors:
159 |       result = actor.compute_gradients.remote()
160 |       if args.num_ps == 1:
161 |         grads_list.append([result])
162 |       else:
163 |         grads_list.append(result)
164 |     
165 |     updates = []
166 |     for ps, shards in zip(ps_actors, transpose(grads_list)):
167 |       updates.append(ps.receive.remote(*shards))
168 |     
169 |     ray.wait(updates, num_returns=args.num_ps)
170 |     
171 |     elapsed_time_ms = (time.perf_counter() - start_time)*1000
172 |     time_list.append(elapsed_time_ms)
173 |     rate = args.size_mb / (elapsed_time_ms/1000)
174 |     log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb*args.num_workers, elapsed_time_ms, rate))
175 |     
176 |   min = np.min(time_list)
177 |   median = np.median(time_list)
178 |   log(f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")
179 | 
180 | 
181 | def main():
182 |   if args.role == 'launcher':
183 |     run_launcher()
184 |   elif args.role == 'driver':
185 |     run_driver()
186 |   else:
187 |     assert False, f"Unknown role {args.role}, must be laucher/driver"
188 | 
189 | 
190 | if __name__ == '__main__':
191 |   main()
192 | 


--------------------------------------------------------------------------------
/benchmarks/ray_two_machines.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #
  3 | # Example of two process Ray program, worker sends values to parameter
  4 | # server on a different machine
  5 | #
  6 | # Run locally:
  7 | # ./ray_two_machines.py
  8 | #
  9 | # Run on AWS:
 10 | # ./ray_two_machines.py --aws
 11 | 
 12 | 
 13 | # Example timings
 14 | # c5.18xlarge over network: 63.0 ms: 1586.76 MB/second
 15 | # c5.9xlarge over network: 399/400 added 100 MBs in 85.5 ms: 1170.26 MB/second
 16 | # c5.18xlarge locally: 86 ms, 1218 MB/second (9.7 Gbps)
 17 | # macbook pro locally: 978.9 ms, 102.15 MB/second
 18 | 
 19 | # c5.18xlarge
 20 | # 004/11 sent 100 MBs in 69.4 ms: 1440.31 MB/second
 21 | # 005/11 sent 100 MBs in 68.1 ms: 1468.95 MB/second
 22 | # 006/11 sent 100 MBs in 70.4 ms: 1421.40 MB/second
 23 | # 007/11 sent 100 MBs in 69.5 ms: 1438.62 MB/second
 24 | # 008/11 sent 100 MBs in 66.4 ms: 1506.90 MB/second
 25 | # 009/11 sent 100 MBs in 76.5 ms: 1306.92 MB/second
 26 | # 010/11 sent 100 MBs in 66.8 ms: 1497.64 MB/second
 27 | # min:    66.36, median:    69.43, mean:    70.55
 28 | 
 29 | # Another run
 30 | # 989/1000 sent 100 MBs in 54.6 ms: 1831.07 MB/second
 31 | # 990/1000 sent 100 MBs in 54.4 ms: 1837.20 MB/second
 32 | # 991/1000 sent 100 MBs in 54.8 ms: 1824.91 MB/second
 33 | # 992/1000 sent 100 MBs in 53.4 ms: 1874.39 MB/second
 34 | # 993/1000 sent 100 MBs in 53.1 ms: 1881.77 MB/second
 35 | # 994/1000 sent 100 MBs in 52.7 ms: 1897.76 MB/second
 36 | # 995/1000 sent 100 MBs in 55.4 ms: 1805.42 MB/second
 37 | # 996/1000 sent 100 MBs in 53.4 ms: 1872.93 MB/second
 38 | # 997/1000 sent 100 MBs in 52.7 ms: 1896.65 MB/second
 39 | # 998/1000 sent 100 MBs in 54.0 ms: 1851.14 MB/second
 40 | # 999/1000 sent 100 MBs in 53.6 ms: 1864.93 MB/second
 41 | # min:    51.11, median:    55.45, mean:    60.74
 42 | 
 43 | 
 44 | # Bottom line: 30ms locally, 60ms over network
 45 | 
 46 | import argparse
 47 | import os
 48 | import socket
 49 | import subprocess
 50 | import time
 51 | 
 52 | import numpy as np
 53 | import ray
 54 | 
 55 | import util
 56 | 
 57 | parser = argparse.ArgumentParser()
 58 | parser.add_argument("--role", default='launcher', type=str,
 59 |                     help="launcher/driver")
 60 | parser.add_argument('--image',
 61 |                     default='Deep Learning AMI (Ubuntu) Version 15.0')
 62 | parser.add_argument("--size-mb", default=100, type=int,
 63 |                     help='how much data to send at each iteration')
 64 | parser.add_argument("--iters", default=11, type=int)
 65 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS")
 66 | parser.add_argument("--xray", default=1, type=int,
 67 |                     help="whether to use XRay backend")
 68 | parser.add_argument('--nightly', default=1, type=int,
 69 |                     help='whether to use nightly version')
 70 | parser.add_argument('--name', default='ray_two_machines', type=str,
 71 |                     help='name of the run')
 72 | parser.add_argument("--ip", default='', type=str,
 73 |                     help="internal flag, used to point worker to head node")
 74 | args = parser.parse_args()
 75 | 
 76 | dim = args.size_mb * 250 * 1000
 77 | 
 78 | 
 79 | @ray.remote(resources={"worker": 1})
 80 | class Worker(object):
 81 |   def __init__(self):
 82 |     self.gradients = np.ones(dim, dtype=np.float32)
 83 | 
 84 |   def compute_gradients(self):
 85 |     return self.gradients
 86 | 
 87 |   def ip(self):
 88 |     return ray.services.get_node_ip_address()
 89 | 
 90 | 
 91 | @ray.remote(resources={"ps": 1})
 92 | class ParameterServer(object):
 93 |   def __init__(self):
 94 |     self.params = np.zeros(dim, dtype=np.float32)
 95 | 
 96 |   def receive(self, grad):
 97 |     self.params = grad  # use = just to get network overhead
 98 |     return self.params
 99 | 
100 |   def get_weights(self):
101 |     return self.params
102 | 
103 |   def ip(self):
104 |     return ray.services.get_node_ip_address()
105 | 
106 | 
107 | 
108 | def run_launcher():
109 |   import ncluster
110 | 
111 |   if args.aws:
112 |     ncluster.set_backend('aws')
113 | 
114 |   if args.nightly:
115 |     # running locally MacOS
116 |     print(f"asdfasdf {util.ossystem('uname')}")
117 |     if 'Darwin' in util.ossystem('uname') and not args.aws:
118 |       install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
119 |       print(f"asdfasdf got install script {install_script}")
120 |     else:
121 |       install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
122 |   else:
123 |     install_script = 'pip install ray'
124 | 
125 |   job = ncluster.make_job(name=args.name,
126 |                           install_script=install_script,
127 |                           image_name=args.image,
128 |                           num_tasks=2)
129 |   ps, worker = job.tasks
130 |   if not ncluster.running_locally():
131 |     ps._run_raw('killall python', ignore_errors=True)
132 |     worker._run_raw('killall python', ignore_errors=True)
133 |   
134 |   job.upload(__file__)
135 |   job.upload('util.py')
136 |   if args.xray:
137 |     job.run('export RAY_USE_XRAY=1')
138 |   job.run('ray stop')
139 | 
140 |   # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
141 |   ps_resource = """--resources='{"ps": 1}'"""
142 |   worker_resource = """--resources='{"worker": 1}'"""
143 |   
144 |   ps.run(f"ray start --head {ps_resource} --redis-port=6379")
145 |   worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
146 |   worker.run(
147 |     f'./{__file__} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}')
148 |   print(worker.read('out'))
149 | 
150 | 
151 | def run_driver():
152 |   ray.init(redis_address=args.ip)
153 | 
154 |   worker = Worker.remote()
155 |   ps = ParameterServer.remote()
156 |   log = util.FileLogger('out')
157 |   log(f"Worker ip {ray.get(worker.ip.remote())}")
158 |   log(f"PS ip {ray.get(ps.ip.remote())}")
159 |   log(f"Driver ip {socket.gethostbyname(socket.gethostname())}")
160 | 
161 |   time_list = []
162 |   for i in range(args.iters):
163 |     start_time = time.perf_counter()
164 |     grads = worker.compute_gradients.remote()
165 |     result = ps.receive.remote(grads)
166 |     ray.wait([result])
167 |     elapsed_time_ms = (time.perf_counter() - start_time)*1000
168 |     time_list.append(elapsed_time_ms)
169 |     rate = args.size_mb / (elapsed_time_ms/1000)
170 |     log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb, elapsed_time_ms, rate))
171 |     
172 |   min = np.min(time_list)
173 |   median = np.median(time_list)
174 |   log(f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")
175 | 
176 | 
177 | def main():
178 |   if args.role == 'launcher':
179 |     run_launcher()
180 |   elif args.role == 'driver':
181 |     run_driver()
182 |   else:
183 |     assert False, f"Unknown role {args.role}, must be laucher/driver"
184 | 
185 | 
186 | if __name__ == '__main__':
187 |   main()
188 | 


--------------------------------------------------------------------------------
/benchmarks/ray_two_machines_local.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #
  3 | # Runs the two-machine benchmark locally on an AWS machine
  4 | #
  5 | # Example timings
  6 | # macbook: added 10 MBs in 14.1 ms: 707.68 MB/second
  7 | # c5.18xlarge: added 10 MBs in 4.4 ms: 2298.82 MB/second
  8 | #      091/100 added 100 MBs in 30.8 ms: 3246.44 MB/second
  9 | 
 10 | # Bottom line: can do 3.2 GB/second running locally, 800 
 11 | import argparse
 12 | import os
 13 | import socket
 14 | import subprocess
 15 | import time
 16 | 
 17 | import numpy as np
 18 | import ray
 19 | 
 20 | import util
 21 | 
 22 | parser = argparse.ArgumentParser()
 23 | parser.add_argument('--image',
 24 |                     default='Deep Learning AMI (Ubuntu) Version 15.0')
 25 | parser.add_argument("--size-mb", default=100, type=int,
 26 |                     help='how much data to send at each iteration')
 27 | parser.add_argument("--iters", default=11, type=int)
 28 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS")
 29 | parser.add_argument("--xray", default=1, type=int,
 30 |                     help="whether to use XRay backend")
 31 | parser.add_argument('--nightly', default=1, type=int,
 32 |                     help='whether to use nightly version')
 33 | parser.add_argument('--name', default='ray_two_machines', type=str,
 34 |                     help='name of the run')
 35 | 
 36 | parser.add_argument("--ip", default='', type=str,
 37 |                     help="internal flag, used to point worker to head node")
 38 | parser.add_argument("--role", default='launcher', type=str,
 39 |                     help="interanl flag, launcher/driver")
 40 | args = parser.parse_args()
 41 | 
 42 | dim = args.size_mb * 250 * 1000
 43 | 
 44 | 
 45 | @ray.remote(resources={"worker": 1})
 46 | class Worker(object):
 47 |   def __init__(self):
 48 |     self.gradients = np.ones(dim, dtype=np.float32)
 49 | 
 50 |   def compute_gradients(self):
 51 |     return self.gradients
 52 | 
 53 |   def ip(self):
 54 |     return ray.services.get_node_ip_address()
 55 | 
 56 | 
 57 | @ray.remote(resources={"ps": 1})
 58 | class ParameterServer(object):
 59 |   def __init__(self):
 60 |     self.params = np.zeros(dim, dtype=np.float32)
 61 | 
 62 |   def assign_add(self, grad):
 63 |     self.params = grad  # use = just to get network overhead
 64 |     return self.params
 65 | 
 66 |   def get_weights(self):
 67 |     return self.params
 68 | 
 69 |   def ip(self):
 70 |     return ray.services.get_node_ip_address()
 71 | 
 72 | 
 73 | 
 74 | def run_launcher():
 75 |   import ncluster
 76 | 
 77 |   if args.aws:
 78 |     ncluster.set_backend('aws')
 79 | 
 80 |   if args.nightly:
 81 |     # running locally MacOS
 82 |     if 'Darwin' in util.ossystem('uname') and not args.aws:
 83 |       install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
 84 |     else:
 85 |       install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
 86 |   else:
 87 |     install_script = 'pip install ray'
 88 | 
 89 |   worker = ncluster.make_task(name=args.name,
 90 |                             install_script=install_script,
 91 |                             image_name=args.image)
 92 |   if not ncluster.running_locally():
 93 |     worker._run_raw('killall python', ignore_errors=True)
 94 |   worker.upload(__file__)
 95 |   worker.upload('util.py')
 96 |   if args.xray:
 97 |     worker.run('export RAY_USE_XRAY=1')
 98 |   worker.run('ray stop')
 99 | 
100 |   resources = """--resources='{"ps": 1, "worker": 1}'"""
101 |   worker.run(f"ray start --head {resources} --redis-port=6379")
102 |   #  worker.run(f"ray start --redis-address={worker.ip}:6379 {resources}")
103 |   worker.run(
104 |     f'./{__file__} --role=driver --ip={worker.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}')
105 |   print(worker.read('out'))
106 | 
107 | 
def run_driver():
  """Time repeated worker -> parameter-server transfers and log the results.

  Connects to the Ray cluster at ``args.ip``, creates one Worker and one
  ParameterServer actor and performs ``args.iters`` transfers. Per-iteration
  timings and a final min/median/mean summary (milliseconds) go to the file
  'out' through ``util.FileLogger``.
  """
  ray.init(redis_address=args.ip)

  worker = Worker.remote()
  ps = ParameterServer.remote()
  log = util.FileLogger('out')
  log(f"Worker ip {ray.get(worker.ip.remote())}")
  log(f"Driver ip {socket.gethostbyname(socket.gethostname())}")

  time_list = []
  for i in range(args.iters):
    start_time = time.perf_counter()
    grads = worker.compute_gradients.remote()
    # ray.get blocks until assign_add has returned, so the whole round trip
    # is inside the timed region; the returned value itself is not needed
    ray.get(ps.assign_add.remote(grads))
    elapsed_time_ms = (time.perf_counter() - start_time)*1000
    time_list.append(elapsed_time_ms)
    rate = args.size_mb / (elapsed_time_ms/1000)
    log('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb, elapsed_time_ms, rate))

  if not time_list:
    # --iters=0: np.min would raise on an empty list
    log('no iterations were run, nothing to summarize')
    return

  # separate names so the builtin min() is not shadowed
  fastest_ms = np.min(time_list)
  median_ms = np.median(time_list)
  log(f"min: {fastest_ms:8.2f}, median: {median_ms:8.2f}, mean: {np.mean(time_list):8.2f}")
131 | 
132 | 
def main():
  """Run the launcher or the driver depending on ``args.role``.

  Raises:
    AssertionError: if ``args.role`` is neither 'launcher' nor 'driver'.
  """
  if args.role == 'launcher':
    run_launcher()
  elif args.role == 'driver':
    run_driver()
  else:
    # explicit raise: ``assert False`` disappears under ``python -O``
    raise AssertionError(
      f"Unknown role {args.role}, must be launcher/driver")


if __name__ == '__main__':
  main()
144 | 


--------------------------------------------------------------------------------
/benchmarks/requirements.txt:
--------------------------------------------------------------------------------
1 | ray
2 | torch
3 | tensorflow
4 | 


--------------------------------------------------------------------------------
/benchmarks/summary.txt:
--------------------------------------------------------------------------------
 1 | tf_two_machines -- 500 on t3, 910 on c3
 2 | 
 3 | Ray can do
 4 | 30ms on local transfers, 60ms on AWS c5.18xlarge
 5 | Using multiple ps shards, can do 48ms on AWS
 6 | 
 7 | 
 8 | 40ms on unoptimized PyTorch clone
 9 | 2.7ms for optimized memcpy on skylake: 300 Gbps (37 GB/second, close to memory bandwidth) -- https://www.google.com/url?q=https://www.anandtech.com/show/11544/intel-skylake-ep-vs-amd-epyc-7000-cpu-battle-of-the-decade/12&source=gmail&ust=1537921524487000&usg=AFQjCNGUrAScjR_rAihauUr-nj5TMg-VKQ
10 | 
11 | 
12 | PyTorch backend can do 20 Gbps per thread on 
13 | 


--------------------------------------------------------------------------------
/benchmarks/tf_two_machines.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | 
  4 | """
  5 | TensorFlow distributed benchmark. Create sender/receiver tasks and add arrays from sender tasks to variable on receiver.
  6 | 
  7 | To run locally:
  8 | ./tf_two_machines.py
  9 | Should see something like this
 10 | 
 11 | ```
 12 | 005/11 added 100 MBs in 78.9 ms: 1266.98 MB/second
 13 | 006/11 added 100 MBs in 78.1 ms: 1280.07 MB/second
 14 | 007/11 added 100 MBs in 78.1 ms: 1280.56 MB/second
 15 | 008/11 added 100 MBs in 81.8 ms: 1222.76 MB/second
 16 | 009/11 added 100 MBs in 79.5 ms: 1258.54 MB/second
 17 | 010/11 added 100 MBs in 76.6 ms: 1305.64 MB/second
 18 | min:    76.59, median:    78.80, mean:    88.34
 19 | ```
 20 | 
 21 | To interact with task 1 (the driver), do "tmux a -t 1"
 22 | 
 23 | To run on AWS
 24 | aws configure # or set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY/AWS_DEFAULT_REGION
 25 | ./tf_two_machines.py --aws
 26 | 
 27 | Should see something like this with t3.large instances
 28 | ```
 29 | 089/100 added 100 MBs in 253.8 ms: 504.27 MB/second
 30 | 090/100 added 100 MBs in 252.6 ms: 506.63 MB/second
 31 | 091/100 added 100 MBs in 255.0 ms: 501.92 MB/second
 32 | ```
 33 | 
 34 | Running c5.18xlarge machines with more iterations
 35 | 007/11 sent 100 MBs in 135.4 ms: 738.47 MB/second
 36 | 008/11 sent 100 MBs in 133.0 ms: 752.04 MB/second
 37 | 009/11 sent 100 MBs in 133.8 ms: 747.48 MB/second
 38 | 010/11 sent 100 MBs in 136.3 ms: 733.77 MB/second
 39 | min:   132.97, median:   134.98, mean:   137.27
 40 | 
 41 | 
 42 | Can use more shards
 43 | ./tf_two_machines.py --aws --shards=8 --iters=1000
 44 | 994/1000 sent 100 MBs in 87.0 ms: 1149.50 MB/second
 45 | 995/1000 sent 100 MBs in 87.0 ms: 1149.21 MB/second
 46 | 996/1000 sent 100 MBs in 86.8 ms: 1152.11 MB/second
 47 | 997/1000 sent 100 MBs in 89.8 ms: 1113.89 MB/second
 48 | 998/1000 sent 100 MBs in 87.9 ms: 1137.37 MB/second
 49 | 999/1000 sent 100 MBs in 88.0 ms: 1135.80 MB/second
 50 | min:    86.12, median:    88.48, mean:    89.51
 51 | 
 52 | 
 53 | To connect and interact with the job look for SSH instructions like this
 54 |    To connect to 0.tf_two_machines
 55 |    ssh -i /Users/yaroslav/.ncluster/ncluster2-yaroslav-316880547378-us-east-1.pem -o StrictHostKeyChecking=no ubuntu@18.234.30.222
 56 | 
 57 | ssh into the instance following these instructions, then run "tmux a"
 58 | 
 59 | 
 60 | """
 61 | 
 62 | import argparse
 63 | import json
 64 | import os
 65 | import numpy as np
 66 | import tensorflow as tf
 67 | import time
 68 | 
 69 | import util
 70 | 
 71 | parser = argparse.ArgumentParser()
 72 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS")
 73 | parser.add_argument("--iters", default=11, type=int,
 74 |                     help="Maximum number of additions")
 75 | parser.add_argument("--size-mb", default=100, type=int,
 76 |                     help="size of vector in MBs")
 77 | parser.add_argument("--shards", default=1, type=int,
 78 |                     help="how many ways to shard the variable")
 79 | parser.add_argument('--image',
 80 |                     default='Deep Learning AMI (Ubuntu) Version 15.0')
 81 | parser.add_argument('--name',
 82 |                     default='tf_two_machines')
 83 | 
 84 | # internal flags
 85 | parser.add_argument('--role', default='launcher', type=str)
 86 | parser.add_argument("--sender-ip", default='127.0.0.1')
 87 | parser.add_argument("--receiver-ip", default='127.0.0.1')
 88 | args = parser.parse_args()
 89 | 
 90 | cluster_spec = {'chief': [args.sender_ip + ':32300'],
 91 |                 'receiver': [args.receiver_ip + ':32301']}
 92 | 
 93 | 
 94 | def _launch_server(role):
 95 |   os.environ['TF_CONFIG'] = json.dumps(
 96 |     {'cluster': cluster_spec,
 97 |      'task': {'type': role, 'index': 0}})
 98 |   config = tf.estimator.RunConfig()
 99 |   return tf.train.Server(config.cluster_spec,
100 |                          job_name=config.task_type,
101 |                          task_index=config.task_id)
102 | 
103 | 
def run_launcher():
  """Create a two-task ncluster job and run the receiver and sender on it.

  The receiver is started without blocking, then the sender is run; finally
  whatever ``sender.read('out')`` returns is printed.
  """
  import ncluster
  if args.aws:
    ncluster.set_backend('aws')

  job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
  for local_file in (__file__, 'util.py'):
    job.upload(local_file)

  sender, receiver = job.tasks
  # kill python just for when tmux session reuse is on
  if not ncluster.running_locally():
    for task in (sender, receiver):
      task._run_raw('killall python', ignore_errors=True)

  if ncluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  receiver_cmd = f'python {__file__} --role=receiver {ip_config}'
  sender_cmd = (f'python {__file__} --role=sender {ip_config} '
                f'--iters={args.iters} --size-mb={args.size_mb} '
                f'--shards={args.shards}')
  receiver.run(receiver_cmd, non_blocking=True)
  sender.run(sender_cmd)
  print(sender.read('out'))
129 | 
130 | 
def run_receiver():
  """Start the 'receiver' TensorFlow server and keep this process alive."""
  server = _launch_server('receiver')
  one_year_sec = 365 * 24 * 3600
  # sleep for a year so the process, and the server it holds, stays up
  time.sleep(one_year_sec)
  del server
135 | 
136 | 
def run_sender():
  """Time repeated transfers of sharded variables from chief to receiver.

  Creates ``args.shards`` variables on the chief job and one matching
  variable per shard on the receiver job, then runs the grouped assign op
  ``args.iters`` times. Per-iteration timings and a final min/median/mean
  summary (milliseconds) are logged to the file 'out'.
  """
  # elements per shard; tf.ones defaults to float32, so 250k elements = 1MB
  param_size = 250 * 1000 * args.size_mb // args.shards
  log = util.FileLogger('out')

  with tf.device('/job:chief/task:0'):
    grads_array = [tf.Variable(tf.ones([param_size]))
                   for _ in range(args.shards)]

  with tf.device('/job:receiver/task:0'):
    # one assign per shard copies the chief's buffer into a receiver variable
    add_op_array = [
      tf.Variable(tf.ones([param_size])).assign(grads).op
      for grads in grads_array]
    add_op = tf.group(*add_op_array)

  server = _launch_server('chief')
  sess = tf.Session(server.target)
  sess.run(tf.global_variables_initializer())

  time_list = []
  for i in range(args.iters):
    start_time = time.perf_counter()
    sess.run(add_op)
    elapsed_time_ms = (time.perf_counter() - start_time) * 1000
    time_list.append(elapsed_time_ms)
    rate = args.size_mb / (elapsed_time_ms / 1000)
    log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % (
      i, args.iters, args.size_mb, elapsed_time_ms, rate))

  if not time_list:
    # --iters=0: np.min would raise on an empty list
    log('no iterations were run, nothing to summarize')
    return

  # separate names so the builtin min() is not shadowed
  fastest_ms = np.min(time_list)
  median_ms = np.median(time_list)
  log(
    f"min: {fastest_ms:8.2f}, median: {median_ms:8.2f}, mean: {np.mean(time_list):8.2f}")
180 | 
181 | 
def main():
  """Run the launcher, sender or receiver depending on ``args.role``.

  Raises:
    AssertionError: if ``args.role`` is not one of the three known roles.
  """
  # run local benchmark in launcher and launch service
  if args.role == "launcher":
    run_launcher()
  elif args.role == "sender":
    run_sender()
  elif args.role == "receiver":
    run_receiver()
  else:
    # explicit raise: ``assert False`` disappears under ``python -O``
    raise AssertionError(
      f"unknown role {args.role}, must be launcher/sender/receiver")


if __name__ == '__main__':
  main()
196 | 


--------------------------------------------------------------------------------
/benchmarks/tf_two_machines_local.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | """
  3 | Runs distributed benchmark on a single machine remotely
  4 | 
  5 | Adding 100MB buffers
  6 | 
  7 | # 1 shard: 88ms
  8 | # 4 shards: 56ms
  9 | # 8 shards: 51ms
 10 | # 16 shards: 55ms
 11 | 
 12 | # increase size 8x
 13 | python tf_two_machines_local.py --shards=8 --iters=100 --size-mb=800 --aws
 14 | # 416ms
 15 | 
 16 | Bottom line: 1.6-1.9 GB/second when running locally
 17 | Going 1->4 shards saves 30%, 4->8 shards another 5%
 18 | 
 19 | i3.metal 30% slower than c5.18xlarge
 20 | 
 21 | """
 22 | 
 23 | import argparse
 24 | import json
 25 | import os
 26 | import numpy as np
 27 | import tensorflow as tf
 28 | import time
 29 | 
 30 | import util
 31 | 
 32 | parser = argparse.ArgumentParser()
 33 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS")
 34 | parser.add_argument("--iters", default=11, type=int,
 35 |                     help="Maximum number of additions")
 36 | parser.add_argument("--size-mb", default=100, type=int,
 37 |                     help="size of vector in MBs")
 38 | parser.add_argument("--shards", default=1, type=int,
 39 |                     help="how many ways to shard the variable")
 40 | parser.add_argument('--image',
 41 |                     default='Deep Learning AMI (Ubuntu) Version 15.0')
 42 | parser.add_argument('--name',
 43 |                     default='tf_two_machines_local')
 44 | 
 45 | # internal flags
 46 | parser.add_argument('--role', default='launcher', type=str)
 47 | parser.add_argument("--sender-ip", default='127.0.0.1')
 48 | parser.add_argument("--receiver-ip", default='127.0.0.1')
 49 | args = parser.parse_args()
 50 | 
 51 | cluster_spec = {'chief': [args.sender_ip + ':32300'],
 52 |                 'receiver': [args.receiver_ip + ':32301']}
 53 | 
 54 | 
 55 | def _launch_server(role):
 56 |   os.environ['TF_CONFIG'] = json.dumps(
 57 |     {'cluster': cluster_spec,
 58 |      'task': {'type': role, 'index': 0}})
 59 |   config = tf.estimator.RunConfig()
 60 |   return tf.train.Server(config.cluster_spec,
 61 |                          job_name=config.task_type,
 62 |                          task_index=config.task_id)
 63 | 
 64 | 
 65 | def run_launcher():
 66 |   import ncluster
 67 |   ncluster.util.assert_script_in_current_directory()
 68 |   
 69 |   if args.aws:
 70 |     ncluster.set_backend('aws')
 71 | 
 72 |   # use 4GB instance, 0.5GB not enough
 73 |   worker = ncluster.make_task(args.name, image_name=args.image,
 74 |                               instance_type='t3.medium')
 75 |   worker.upload(__file__)
 76 |   worker.upload('util.py')
 77 | 
 78 |   # kill python just for when tmux session reuse is on
 79 |   if not ncluster.running_locally():
 80 |     # on AWS probably running in conda DLAMI, switch into TF-enabled env
 81 |     worker._run_raw('killall python', ignore_errors=True)
 82 |     worker.run('source activate tensorflow_p36')
 83 | 
 84 |   ip_config = f'--sender-ip={worker.ip} --receiver-ip={worker.ip}'
 85 |   worker.run(f'python {__file__} --role=receiver {ip_config}',
 86 |                non_blocking=True)
 87 |   worker.switch_window(1)  # run in new tmux window
 88 |   if not ncluster.running_locally():
 89 |     worker.run('source activate tensorflow_p36')
 90 |   worker.run(
 91 |     f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}')
 92 |   print(worker.read('out'))
 93 | 
 94 | 
 95 | def run_receiver():
 96 |   server = _launch_server('receiver')
 97 |   time.sleep(365 * 24 * 3600)
 98 |   del server
 99 | 
100 | 
def run_sender():
  """Time repeated transfers of sharded variables from chief to receiver.

  Creates ``args.shards`` variables on the chief job and one matching
  variable per shard on the receiver job, then runs the grouped assign op
  ``args.iters`` times. Per-iteration timings and a final min/median/mean
  summary (milliseconds) are logged to the file 'out'.
  """
  # elements per shard; tf.ones defaults to float32, so 250k elements = 1MB
  param_size = 250 * 1000 * args.size_mb // args.shards
  log = util.FileLogger('out')

  with tf.device('/job:chief/task:0'):
    grads_array = [tf.Variable(tf.ones([param_size]))
                   for _ in range(args.shards)]

  with tf.device('/job:receiver/task:0'):
    # one assign per shard copies the chief's buffer into a receiver variable
    add_op_array = [
      tf.Variable(tf.ones([param_size])).assign(grads).op
      for grads in grads_array]
    add_op = tf.group(*add_op_array)

  server = _launch_server('chief')
  sess = tf.Session(server.target)
  sess.run(tf.global_variables_initializer())

  time_list = []
  for i in range(args.iters):
    start_time = time.perf_counter()
    sess.run(add_op)
    elapsed_time_ms = (time.perf_counter() - start_time) * 1000
    time_list.append(elapsed_time_ms)
    rate = args.size_mb / (elapsed_time_ms / 1000)
    log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % (
      i, args.iters, args.size_mb, elapsed_time_ms, rate))

  if not time_list:
    # --iters=0: np.min would raise on an empty list
    log('no iterations were run, nothing to summarize')
    return

  # separate names so the builtin min() is not shadowed
  fastest_ms = np.min(time_list)
  median_ms = np.median(time_list)
  log(
    f"min: {fastest_ms:8.2f}, median: {median_ms:8.2f}, mean: {np.mean(time_list):8.2f}")
144 | 
145 | 
def main():
  """Run the launcher, sender or receiver depending on ``args.role``.

  Raises:
    AssertionError: if ``args.role`` is not one of the three known roles.
  """
  # run local benchmark in launcher and launch service
  if args.role == "launcher":
    run_launcher()
  elif args.role == "sender":
    run_sender()
  elif args.role == "receiver":
    run_receiver()
  else:
    # explicit raise: ``assert False`` disappears under ``python -O``
    raise AssertionError(
      f"unknown role {args.role}, must be launcher/sender/receiver")


if __name__ == '__main__':
  main()
160 | 


--------------------------------------------------------------------------------
/benchmarks/util.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | import sys
 4 | 
 5 | 
 6 | class FileLogger:
 7 |   """Helper class to log to file (possibly mirroring to stderr)
 8 |      logger = FileLogger('somefile.txt')
 9 |      logger = FileLogger('somefile.txt', mirror=True)
10 |      logger('somemessage')
11 |      logger('somemessage: %s %.2f', 'value', 2.5)
12 |   """
13 | 
14 |   def __init__(self, fn, mirror=True):
15 |     self.fn = fn
16 |     self.f = open(fn, 'w')
17 |     self.mirror = mirror
18 |     print(f"Creating FileLogger on {os.path.abspath(fn)}")
19 | 
20 |   def __call__(self, s='', *args):
21 |     """Either ('asdf %f', 5) or (val1, val2, val3, ...)"""
22 |     if (isinstance(s, str) or isinstance(s, bytes)) and '%' in s:
23 |       formatted_s = s % args
24 |     else:
25 |       toks = [s] + list(args)
26 |       formatted_s = ', '.join(str(s) for s in toks)
27 | 
28 |     self.f.write(formatted_s + '\n')
29 |     self.f.flush()
30 |     if self.mirror:
31 |       # use manual flushing because "|" makes output 4k buffered instead of
32 |       # line-buffered
33 |       sys.stdout.write(formatted_s+'\n')
34 |       sys.stdout.flush()
35 | 
36 |   def __del__(self):
37 |     self.f.close()
38 | 
39 | 
def ossystem(cmd):
  """Like os.system, but returns output of command as string.

  ``cmd`` is run through the shell with stderr merged into stdout. The output
  is decoded as UTF-8 and undecodable bytes are replaced, so non-ASCII
  output no longer raises UnicodeDecodeError.

  NOTE: ``cmd`` goes through the shell verbatim; never pass untrusted input.
  """
  p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT)
  stdout, _ = p.communicate()  # second item is None: stderr went to stdout
  return stdout.decode('utf-8', errors='replace')
46 | 


--------------------------------------------------------------------------------
/examples/deleteme.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Scratch script: prints the path this script was invoked with.
import sys

script_path = sys.argv[0]
print(script_path)
4 | 


--------------------------------------------------------------------------------
/examples/gpubox.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # Launch a single GPU instance with jupyter notebook
 4 | 
 5 | import argparse
 6 | import os
 7 | import ncluster
 8 | 
 9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('--name', type=str, default='gpubox',
11 |                     help="instance name")
12 | parser.add_argument('--image-name', type=str,
13 |                     default='Deep Learning AMI (Ubuntu) Version 15.0',
14 |                     help="name of AMI to use ")
15 | parser.add_argument('--instance-type', type=str, default='p3.2xlarge',
16 |                     help="type of instance")
17 | parser.add_argument('--password',
18 |                     default='DefaultNotebookPasswordPleaseChange',
19 |                     help='password to use for jupyter notebook')
20 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS")
21 | 
22 | args = parser.parse_args()
23 | module_path = os.path.dirname(os.path.abspath(__file__))
24 | 
25 | if args.aws:
26 |   ncluster.set_backend('aws')
27 | 
def main():
  """Create the task, upload notebook config and sample, start Jupyter."""
  task = ncluster.make_task(name=args.name,
                            instance_type=args.instance_type,
                            image_name=args.image_name)

  # upload notebook config with provided password
  task.upload(_create_jupyter_config(args.password),
              '~/.jupyter/jupyter_notebook_config.py')

  # upload sample notebook and start Jupyter server
  sample_notebook = f'{module_path}/gpubox_sample.ipynb'
  task.run('mkdir -p /ncluster/notebooks')
  task.upload(sample_notebook,
              '/ncluster/notebooks/gpubox_sample.ipynb',
              dont_overwrite=True)
  task.run('cd /ncluster/notebooks')
  # started without blocking so the URL below is printed right away
  task.run('jupyter notebook', non_blocking=True)
  print(f'Jupyter notebook will be at http://{task.public_ip}:8888')
46 | 
47 | 
def _create_jupyter_config(password):
  """Write a copy of the notebook config with ``password`` hashed into it.

  Args:
    password: plain-text notebook password; only its hash is written.

  Returns:
    Path of the generated config file under /tmp.
  """
  import shutil
  from notebook.auth import passwd

  # hash the argument; the original ignored it and read args.password
  sha = passwd(password)
  local_config_fn = f'{module_path}/gpubox_jupyter_notebook_config.py'
  temp_config_fn = '/tmp/' + os.path.basename(local_config_fn)
  # shutil.copy copes with spaces in paths and raises if the copy fails,
  # unlike an unchecked os.system('cp ...')
  shutil.copy(local_config_fn, temp_config_fn)
  _replace_lines(temp_config_fn, 'c.NotebookApp.password',
                 f"c.NotebookApp.password = '{sha}'")
  return temp_config_fn
57 | 
58 | 
59 | def _replace_lines(fn, startswith, new_line):
60 |   """Replace lines starting with starts_with in fn with new_line."""
61 |   new_lines = []
62 |   for line in open(fn):
63 |     if line.startswith(startswith):
64 |       new_lines.append(new_line)
65 |     else:
66 |       new_lines.append(line)
67 |   with open(fn, 'w') as f:
68 |     f.write('\n'.join(new_lines))
69 | 
70 | 
71 | if __name__ == '__main__':
72 |   main()
73 | 


--------------------------------------------------------------------------------
/examples/gpubox_sample.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 1,
 6 |    "metadata": {},
 7 |    "outputs": [
 8 |     {
 9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "Mon Aug 13 23:41:40 2018       \r\n",
13 |       "+-----------------------------------------------------------------------------+\r\n",
14 |       "| NVIDIA-SMI 396.37                 Driver Version: 396.37                    |\r\n",
15 |       "|-------------------------------+----------------------+----------------------+\r\n",
16 |       "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\r\n",
17 |       "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\r\n",
18 |       "|===============================+======================+======================|\r\n",
19 |       "|   0  Tesla M60           On   | 00000000:00:1E.0 Off |                    0 |\r\n",
20 |       "| N/A   43C    P8    14W / 150W |      0MiB /  7618MiB |      0%      Default |\r\n",
21 |       "+-------------------------------+----------------------+----------------------+\r\n",
22 |       "                                                                               \r\n",
23 |       "+-----------------------------------------------------------------------------+\r\n",
24 |       "| Processes:                                                       GPU Memory |\r\n",
25 |       "|  GPU       PID   Type   Process name                             Usage      |\r\n",
26 |       "|=============================================================================|\r\n",
27 |       "|  No running processes found                                                 |\r\n",
28 |       "+-----------------------------------------------------------------------------+\r\n"
29 |      ]
30 |     }
31 |    ],
32 |    "source": [
33 |     "!nvidia-smi"
34 |    ]
35 |   },
36 |   {
37 |    "cell_type": "code",
38 |    "execution_count": null,
39 |    "metadata": {},
40 |    "outputs": [],
41 |    "source": []
42 |   }
43 |  ],
44 |  "metadata": {
45 |   "kernelspec": {
46 |    "display_name": "Python [default]",
47 |    "language": "python",
48 |    "name": "python3"
49 |   },
50 |   "language_info": {
51 |    "codemirror_mode": {
52 |     "name": "ipython",
53 |     "version": 3
54 |    },
55 |    "file_extension": ".py",
56 |    "mimetype": "text/x-python",
57 |    "name": "python",
58 |    "nbconvert_exporter": "python",
59 |    "pygments_lexer": "ipython3",
60 |    "version": "3.6.4"
61 |   },
62 |   "toc": {
63 |    "colors": {
64 |     "hover_highlight": "#DAA520",
65 |     "running_highlight": "#FF0000",
66 |     "selected_highlight": "#FFD700"
67 |    },
68 |    "moveMenuLeft": true,
69 |    "nav_menu": {
70 |     "height": "12px",
71 |     "width": "252px"
72 |    },
73 |    "navigate_menu": true,
74 |    "number_sections": true,
75 |    "sideBar": true,
76 |    "threshold": 4,
77 |    "toc_cell": false,
78 |    "toc_section_display": "block",
79 |    "toc_window_display": false
80 |   }
81 |  },
82 |  "nbformat": 4,
83 |  "nbformat_minor": 2
84 | }
85 | 


--------------------------------------------------------------------------------
/examples/launch_16_instances.py:
--------------------------------------------------------------------------------
 1 | import ncluster
 2 | import time
 3 | 
 4 | def main():
 5 |   ncluster.set_backend('aws')
 6 |   
 7 |   start_time = time.time()
 8 |   job = ncluster.make_job(num_tasks=16)
 9 |   print(f"waited for startup for {time.time()-start_time} seconds")
10 | 
11 |   start_time = time.time()
12 |   job.run('sleep 10')
13 |   print(f"waited for exec for {time.time()-start_time} seconds")
14 | 
15 | if __name__ == '__main__':
16 |   main()
17 | 


--------------------------------------------------------------------------------
/examples/ray_example.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #
  3 | # Example of two process Ray program, worker sends values to parameter
  4 | # server on a different machine
  5 | #
  6 | # Run locally:
  7 | # ./ray_example.py
  8 | #
  9 | # Run on AWS:
 10 | # ./ray_example.py --aws
 11 | 
 12 | import argparse
 13 | import os
 14 | import time
 15 | 
 16 | import numpy as np
 17 | import ray
 18 | 
 19 | parser = argparse.ArgumentParser()
 20 | parser.add_argument("--role", default='launcher', type=str,
 21 |                     help="launcher/driver")
 22 | parser.add_argument('--image', default='Deep Learning AMI (Ubuntu) Version 13.0')
 23 | parser.add_argument("--size-mb", default=10, type=int, help='how much data to send at each iteration')
 24 | parser.add_argument("--iters", default=10, type=int)
 25 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS")
 26 | parser.add_argument("--ip", default='', type=str,
 27 |                     help="internal flag, used to point worker to head node")
 28 | 
 29 | args = parser.parse_args()
 30 | 
 31 | dim = args.size_mb * 250 * 1000
 32 | 
 33 | 
 34 | @ray.remote(resources={"worker": 1})
 35 | class Worker(object):
 36 |   def __init__(self):
 37 |     self.gradients = np.ones(dim, dtype=np.float32)
 38 | 
 39 |   def compute_gradients(self):
 40 |     return self.gradients
 41 | 
 42 | 
 43 | @ray.remote(resources={"ps": 1})
 44 | class ParameterServer(object):
 45 |   def __init__(self):
 46 |     self.params = np.zeros(dim, dtype=np.float32)
 47 | 
 48 |   def assign_add(self, grad):
 49 |     self.params += grad
 50 |     return self.params
 51 | 
 52 |   def get_weights(self):
 53 |     return self.params
 54 | 
 55 | 
 56 | def run_launcher():
 57 |   import ncluster
 58 | 
 59 |   if args.aws:
 60 |     ncluster.set_backend('aws')
 61 | 
 62 |   script = os.path.basename(__file__)
 63 |   assert script in os.listdir('.')
 64 |   job = ncluster.make_job(install_script='pip install ray',
 65 |                           image_name=args.image,
 66 |                           instance_type='c5.large',
 67 |                           num_tasks=2)
 68 |   job.upload(script)
 69 |   job.run('export RAY_USE_XRAY=1')
 70 |   job.run('ray stop')
 71 | 
 72 |   # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
 73 |   ps_resource = """--resources='{"ps": 1}'"""
 74 |   worker_resource = """--resources='{"worker": 1}'"""
 75 |   ps, worker = job.tasks
 76 |   ps.run(f"ray start --head {ps_resource} --redis-port=6379")
 77 |   worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
 78 |   worker.run(f'./{script} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}')
 79 | 
 80 | 
 81 | def run_driver():
 82 |   ray.init(redis_address=args.ip)
 83 | 
 84 |   worker = Worker.remote()
 85 |   ps = ParameterServer.remote()
 86 | 
 87 |   for iteration in range(args.iters):
 88 |     start_time = time.time()
 89 |     grads = worker.compute_gradients.remote()
 90 |     result = ps.assign_add.remote(grads)
 91 |     result = ray.get(result)[0]
 92 |     elapsed_time = time.time() - start_time
 93 |     rate = args.size_mb / elapsed_time
 94 |     print('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (result, args.iters, args.size_mb, elapsed_time * 1000, rate))
 95 | 
 96 | 
 97 | def main():
 98 |   if args.role == 'launcher':
 99 |     run_launcher()
100 |   elif args.role == 'driver':
101 |     run_driver()
102 |   else:
103 |     assert False, f"Unknown role {args.role}, must be laucher/driver"
104 | 
105 | 
106 | if __name__ == '__main__':
107 |   main()
108 | 


--------------------------------------------------------------------------------
/examples/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter     # for notebook.auth.passwd
2 | tensorflow
3 | torch
4 | ray
5 | 


--------------------------------------------------------------------------------
/examples/simple_job.py:
--------------------------------------------------------------------------------
 1 | import ncluster
 2 | import time
 3 | 
 4 | def main():
 5 |   ncluster.set_backend('local')
 6 | 
 7 |   job = ncluster.make_job(num_tasks=2)
 8 | 
 9 |   start_time = time.time()
10 |   job.run('sleep 1')
11 |   print(f"waited for {time.time()-start_time} seconds")
12 | 
13 | if __name__ == '__main__':
14 |   main()
15 | 


--------------------------------------------------------------------------------
/examples/simple_task.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import ncluster
3 | 
4 | # allocate default machine type and default image
5 | task = ncluster.make_task()
6 | output = task.run('ifconfig')
7 | print(f"Task ifconfig returned {output}")
8 | 


--------------------------------------------------------------------------------
/examples/simple_tf.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | import sys
 3 | 
 4 | if not sys.argv[1:]:
 5 |   import ncluster
 6 |   task = ncluster.make_task(instance_type='t3.micro')
 7 |   task.upload(__file__)
 8 |   task.run('pip install tensorflow')
 9 |   task.run(f'python {__file__} worker')
10 | elif sys.argv[1] == 'worker':
11 |   import tensorflow as tf
12 |   import os
13 |   sess = tf.Session()
14 |   ones = tf.ones((1000,1000))
15 |   result = sess.run(tf.matmul(ones, ones))
16 |   print(f"matmul gave {result.sum()}")
17 |   os.system('sudo shutdown -h -P 10')  # shut down the instance in 10 mins
18 | 
19 | 


--------------------------------------------------------------------------------
/examples/tf_adder.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | 
  4 | """
  5 | TensorFlow distributed benchmark. Create sender/receiver tasks and add arrays from sender tasks to variable on receiver.
  6 | 
  7 | To run locally:
  8 | ./tf_adder.py
  9 | tmux a -t 0
 10 | 
 11 | Should see something like this
 12 | ```
 13 | 089/100 added 100 MBs in 114.9 ms: 1114.36 MB/second
 14 | 090/100 added 100 MBs in 113.4 ms: 1128.61 MB/second
 15 | 091/100 added 100 MBs in 113.4 ms: 1128.60 MB/second
 16 | ```
 17 | 
 18 | 
 19 | To run on AWS
 20 | aws configure # or set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY/AWS_DEFAULT_REGION
 21 | ./tf_adder.py --aws
 22 | nconnect 0.tf_adder
 23 | 
 24 | Should see something like this with t3.large instances
 25 | ```
 26 | 089/100 added 100 MBs in 253.8 ms: 504.27 MB/second
 27 | 090/100 added 100 MBs in 252.6 ms: 506.63 MB/second
 28 | 091/100 added 100 MBs in 255.0 ms: 501.92 MB/second
 29 | ```
 30 | 
 31 | """
 32 | 
 33 | import argparse
 34 | import json
 35 | import os
 36 | import tensorflow as tf
 37 | import time
 38 | 
 39 | parser = argparse.ArgumentParser()
 40 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS")
 41 | parser.add_argument("--iters", default=20, type=int, help="Maximum number of additions")
 42 | parser.add_argument("--data-mb", default=100, type=int, help="size of vector in MBs")
 43 | parser.add_argument('--image',
 44 |                     default='Deep Learning AMI (Ubuntu) Version 15.0')
 45 | 
 46 | # internal flags
 47 | parser.add_argument('--role', default='launcher', type=str)
 48 | parser.add_argument("--sender-ip", default='127.0.0.1')
 49 | parser.add_argument("--receiver-ip", default='127.0.0.1')
 50 | args = parser.parse_args()
 51 | 
 52 | cluster_spec = {'chief': [args.sender_ip + ':32300'],
 53 |                 'receiver': [args.receiver_ip + ':32301']}
 54 | 
 55 | 
 56 | def _launch_server(role):
 57 |   os.environ['TF_CONFIG'] = json.dumps(
 58 |     {'cluster': cluster_spec,
 59 |      'task': {'type': role, 'index': 0}})
 60 |   config = tf.estimator.RunConfig()
 61 |   return tf.train.Server(config.cluster_spec,
 62 |                          job_name=config.task_type,
 63 |                          task_index=config.task_id)
 64 | 
 65 | 
 66 | def run_launcher():
 67 |   import ncluster
 68 |   if args.aws:
 69 |     ncluster.set_backend('aws')
 70 | 
 71 |   job = ncluster.make_job('tf_adder', num_tasks=2, image_name=args.image)
 72 |   job.upload(__file__)
 73 |   
 74 |   sender, receiver = job.tasks
 75 |   if ncluster.get_backend() == 'aws':
 76 |     # on AWS probably running in conda DLAMI, switch into TF-enabled env
 77 |     job.run('source activate tensorflow_p36')
 78 | 
 79 |   ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
 80 |   receiver.run(f'python tf_adder.py --role=receiver {ip_config}',
 81 |                non_blocking=True)
 82 |   sender.run(f'python tf_adder.py --role=sender {ip_config} --iters={args.iters}')
 83 | 
 84 | 
 85 | def run_receiver():
 86 |   server = _launch_server('receiver')
 87 |   time.sleep(365 * 24 * 3600)
 88 |   del server
 89 | 
 90 | 
 91 | def run_sender():
 92 |   param_size = 250 * 1000 * args.data_mb  # 1 MB holds 250k 4-byte float values
 93 |   with tf.device('/job:chief/task:0'):
 94 |     grads = tf.fill([param_size], 1.)
 95 | 
 96 |   with tf.device('/job:receiver/task:0'):
 97 |     params = tf.Variable(tf.ones([param_size]))
 98 |     add_op = params.assign_add(grads).op
 99 | 
100 |   server = _launch_server('chief')
101 |   sess = tf.Session(server.target)
102 | 
103 |   sess.run(tf.global_variables_initializer())
104 | 
105 |   for i in range(args.iters):
106 |     start_time = time.time()
107 |     sess.run(add_op)
108 |     elapsed_time = time.time() - start_time
109 |     rate = args.data_mb / elapsed_time
110 |     print('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.data_mb, elapsed_time * 1000, rate))
111 | 
112 | 
113 | def main():
114 |   # run local benchmark in launcher and launch service
115 |   if args.role == "launcher":
116 |     run_launcher()
117 |   elif args.role == "sender":
118 |     run_sender()
119 |   elif args.role == "receiver":
120 |     run_receiver()
121 |   else:
122 |     assert False, 'unknown role'
123 | 
124 | 
125 | if __name__ == '__main__':
126 |   main()
127 | 


--------------------------------------------------------------------------------
/examples/tf_adder_tb.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | 
  4 | """
  5 | TensorFlow distributed benchmark + TensorBoard. Create sender/receiver tasks and add arrays from sender tasks to
  6 | variable on receiver.
  7 | 
  8 | To run locally:
  9 | ./tf_adder_tb.py
 10 | 
 11 | Should see something like this
 12 | ```
 13 | ...
 14 | Benchmark done, tensorboard at http://127.0.0.1:6006
 15 | ```
 16 | 
 17 | 
 18 | To run on AWS
 19 | aws configure # or set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY/AWS_DEFAULT_REGION
 20 | 
 21 | ./tf_adder_tb.py --aws
 22 | 
 23 | After a minute should see something like this
 24 | 
 25 | ...
 26 | Benchmark done, tensorboard at http://35.173.134.87:6006
 27 | """
 28 | 
 29 | import argparse
 30 | import json
 31 | import os
 32 | import tensorflow as tf
 33 | import time
 34 | 
 35 | parser = argparse.ArgumentParser()
 36 | parser.add_argument('--role', default='launcher', type=str)
 37 | parser.add_argument("--iters", default=20, help="Maximum number of additions")
 38 | parser.add_argument("--data-mb", default=128, help="size of vector in MBs")
 39 | parser.add_argument("--sender-ip", default='127.0.0.1')
 40 | parser.add_argument("--receiver-ip", default='127.0.0.1')
 41 | parser.add_argument("--logdir", help='logging directory')
 42 | parser.add_argument("--aws", action='store_true')
 43 | parser.add_argument('--image', default='Deep Learning AMI (Amazon Linux) Version 13.0')
 44 | args = parser.parse_args()
 45 | 
 46 | cluster_spec = {'chief': [args.sender_ip + ':32300'],
 47 |                 'receiver': [args.receiver_ip + ':32301']}
 48 | 
 49 | 
 50 | def _launch_server(role):
 51 |   os.environ['TF_CONFIG'] = json.dumps(
 52 |     {'cluster': cluster_spec,
 53 |      'task': {'type': role, 'index': 0}})
 54 |   config = tf.estimator.RunConfig()
 55 |   return tf.train.Server(config.cluster_spec,
 56 |                          job_name=config.task_type,
 57 |                          task_index=config.task_id)
 58 | 
 59 | 
 60 | def run_launcher():
 61 |   import ncluster
 62 | 
 63 |   if args.aws:
 64 |     ncluster.set_backend('aws')
 65 |   job = ncluster.make_job('tf_adder_tb', num_tasks=2, image_name=args.image)
 66 |   job.upload(__file__)
 67 |   this_file = os.path.basename(__file__)
 68 | 
 69 |   sender, receiver = job.tasks
 70 |   if ncluster.get_backend() == 'aws':
 71 |     # on AWS we are probably running on a DLAMI, so switch into the TF-enabled env
 72 |     job.run('source activate tensorflow_p36')
 73 | 
 74 |   ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
 75 |   job.tasks[1].run(f'python {this_file} --role=receiver {ip_config}', non_blocking=True)
 76 |   job.tasks[0].run(f'python {this_file} --role=sender --logdir={job.logdir} {ip_config}')
 77 |   job.tasks[0].run(f'tensorboard --logdir={job.logdir}/..', non_blocking=True)
 78 |   print(f"Benchmark done, tensorboard at http://{job.tasks[0].public_ip}:6006")
 79 | 
 80 | 
 81 | def run_receiver():
 82 |   server = _launch_server('receiver')
 83 |   time.sleep(365 * 24 * 3600)
 84 |   del server
 85 | 
 86 | 
 87 | def run_sender():
 88 |   summary_writer = tf.summary.FileWriter(args.logdir)
 89 | 
 90 |   param_size = 250 * 1000 * args.data_mb  # 1 MB holds 250k 4-byte float values
 91 |   with tf.device('/job:chief/task:0'):
 92 |     grads = tf.fill([param_size], 1.)
 93 | 
 94 |   with tf.device('/job:receiver/task:0'):
 95 |     params = tf.Variable(tf.ones([param_size]))
 96 |     add_op = params.assign_add(grads).op
 97 | 
 98 |   server = _launch_server('chief')
 99 |   sess = tf.Session(server.target)
100 | 
101 |   sess.run(tf.global_variables_initializer())
102 | 
103 |   for i in range(args.iters):
104 |     start_time = time.time()
105 |     sess.run(add_op)
106 |     elapsed_time = time.time() - start_time
107 |     rate = args.data_mb / elapsed_time
108 |     print('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.data_mb, elapsed_time * 1000, rate))
109 |     summary = tf.Summary()
110 |     summary.value.add(tag='time_ms', simple_value=elapsed_time * 1000)
111 |     summary_writer.add_summary(summary, i)
112 | 
113 |   summary_writer.close()
114 | 
115 | 
116 | def main():
117 |   # run local benchmark in launcher and launch service
118 |   if args.role == "launcher":
119 |     run_launcher()
120 |   elif args.role == "sender":
121 |     run_sender()
122 |   elif args.role == "receiver":
123 |     run_receiver()
124 |   else:
125 |     assert False, 'unknown role'
126 | 
127 | 
128 | if __name__ == '__main__':
129 |   main()
130 | 


--------------------------------------------------------------------------------
/ncluster/__init__.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from . import aws_backend
 4 | from . import aws_util
 5 | from . import util
 6 | from . import local_backend
 7 | from . import backend  # TODO: remove?
 8 | 
 9 | from .ncluster import get_backend
10 | from .ncluster import set_backend
11 | from .ncluster import running_locally
12 | 
13 | from .ncluster import use_aws
14 | from .ncluster import use_local
15 | 
16 | from .ncluster import make_task
17 | from .ncluster import make_job
18 | from .ncluster import make_run
19 | from .ncluster import get_zone
20 | from .ncluster import get_region
21 | from .ncluster import set_logdir_root
22 | from .ncluster import get_logdir_root
23 | 
24 | 
25 | # set default backend from environment
26 | if 'NCLUSTER_BACKEND' in os.environ:
27 |   set_backend(os.environ['NCLUSTER_BACKEND'])
28 | else:
29 |   set_backend('local')
30 | 
31 | util.install_pdb_handler()  # CTRL+\ drops into pdb
32 | 


--------------------------------------------------------------------------------
/ncluster/aws_backend.py:
--------------------------------------------------------------------------------
   1 | """AWS implementation of backend.py
   2 | 
   3 | Not thread-safe
   4 | """
   5 | import glob
   6 | import os
   7 | import pprint
   8 | import shlex
   9 | import signal
  10 | import stat
  11 | import threading
  12 | import time
  13 | from typing import Tuple, List
  14 | 
  15 | import paramiko
  16 | 
  17 | from ncluster import ncluster_globals
  18 | 
  19 | from . import aws_create_resources as create_lib
  20 | from . import aws_util as u
  21 | from . import backend
  22 | from . import util
  23 | 
  24 | TMPDIR = '/tmp/ncluster'  # location for temp files on launching machine
  25 | AWS_LOCK_FN = '/tmp/aws.lock'  # lock file used to prevent concurrent creation of AWS resources by multiple workers in parallel
  26 | NCLUSTER_DEFAULT_REGION = 'us-east-1'  # used as last resort if no other method set a region
  27 | LOGDIR_ROOT = '/ncluster/runs'
  28 | 
  29 | # some image which is fast to load, to use for quick runs
  30 | GENERIC_SMALL_IMAGE = 'amzn2-ami-hvm-2.0.20180622.1-x86_64-gp2'
  31 | 
  32 | 
  33 | class Task(backend.Task):
  34 |   """AWS task is initialized with an AWS instance and handles initialization,
  35 |   creation of SSH session, shutdown"""
  36 |   last_status: int  # status of last command executed
  37 | 
  38 |   tmux_window_id: int
  39 |   tmux_available_window_ids: List[int]
  40 | 
  41 |   sftp: paramiko.SFTPClient
  42 | 
  43 |   def __init__(self, name, *, instance, install_script='', image_name='',
  44 |                **extra_kwargs):
  45 |     """
  46 |     Initializes a Task on top of an existing AWS instance. Blocks until the instance is ready to execute
  47 |     shell commands.
  48 | 
  49 |     Args:
  50 |       name: task name
  51 |       instance: ec2.Instance object (https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#instance)
  52 |       install_script: shell commands run once to set up the instance (skipped when it is already initialized)
  53 |       image_name: AWS image name
  54 |       **extra_kwargs: unused kwargs (kept for compatibility with other backends)
  55 |     """
  56 |     self._cmd_fn = None
  57 |     self._cmd = None
  58 |     self._status_fn = None  # location of output of last status
  59 |     self.last_status = -1
  60 | 
  61 |     self._can_run = False  # indicates that things needed for .run were created
  62 |     self.initialize_called = False
  63 | 
  64 |     self.name = name
  65 |     self.instance = instance
  66 |     self.install_script = install_script
  67 |     self.extra_kwargs = extra_kwargs
  68 | 
  69 |     self.public_ip = u.get_public_ip(instance)
  70 |     self.ip = u.get_ip(instance)
  71 |     self.sftp = None
  72 |     self._linux_type = 'ubuntu'
  73 | 
  74 |     # heuristic to tell if I'm using Amazon image name
  75 |     # default image has name like 'amzn2-ami-hvm-2.0.20180622.1-x86_64-gp2'
  76 |     if 'amzn' in image_name.lower() or 'amazon' in image_name.lower():
  77 |       self.log('Detected Amazon Linux image')
  78 |       self._linux_type = 'amazon'
  79 |     self.run_counter = 0
  80 | 
  81 |     launch_id = util.random_id()
  82 |     self.local_scratch = f"{TMPDIR}/{name}-{launch_id}"
  83 |     self.remote_scratch = f"{TMPDIR}/{name}-{launch_id}"
  84 | 
  85 |     os.system('mkdir -p ' + self.local_scratch)
  86 | 
  87 |     self._initialized_fn = f'is_initialized'
  88 | 
  89 |     # _current_directory tracks current directory on task machine
  90 |     # used for uploading without specifying absolute path on target machine
  91 |     if self._linux_type == 'ubuntu':
  92 |       #      self._current_directory = '/home/ubuntu'
  93 |       self.ssh_username = 'ubuntu'  # default username on task machine
  94 |     elif self._linux_type == 'amazon':
  95 |       #      self._current_directory = '/home/ec2-user'
  96 |       self.ssh_username = 'ec2-user'
  97 |     self.homedir = '/home/' + self.ssh_username
  98 | 
  99 |     self.ssh_client = u.ssh_to_task(self)
 100 |     self._setup_tmux()
 101 |     self._run_raw('mkdir -p ' + self.remote_scratch)
 102 | 
 103 |     self._can_run = True
 104 | 
 105 |     if self._is_initialized_fn_present():
 106 |       self.log("reusing previous initialized state")
 107 |     else:
 108 |       self.log("running install script")
 109 | 
 110 |       # bin/bash needed to make self-executable or use with UserData
 111 |       self.install_script = '#!/bin/bash\n' + self.install_script
 112 |       self.install_script += f'\necho ok > {self._initialized_fn}\n'
 113 |       self.file_write('install.sh', util.shell_add_echo(self.install_script))
 114 |       self.run('bash -e install.sh')  # fail on errors
 115 |       assert self._is_initialized_fn_present(), f"Install script didn't write to {self._initialized_fn}"
 116 | 
 117 |     self._mount_efs()
 118 |     self.connect_instructions = f"""
 119 |     To connect to {self.name}
 120 | ssh -i {u.get_keypair_fn()} -o StrictHostKeyChecking=no {self.ssh_username}@{self.public_ip}
 121 | tmux a
 122 | """.strip()
 123 |     self.log("Initialize complete")
 124 |     self.log(self.connect_instructions)
 125 | 
 126 |   def _is_initialized_fn_present(self):
 127 |     self.log("Checking for initialization status")
 128 |     try:
 129 |       return 'ok' in self.read(self._initialized_fn)
 130 |     except Exception:
 131 |       return False
 132 | 
 133 |   def _setup_tmux(self):
 134 |     self.log("Setting up tmux")
 135 | 
 136 |     self.tmux_session = self.name.replace('.', '=')
 137 |     self.tmux_window_id = 0
 138 |     self.tmux_available_window_ids = [0]
 139 | 
 140 |     tmux_cmd = [f'tmux set-option -g history-limit 50000 \; ',
 141 |                 f'set-option -g mouse on \; ',
 142 |                 f'new-session -s {self.tmux_session} -n 0 -d']
 143 | 
 144 |     # hack to get around Amazon linux not having tmux
 145 |     if self._linux_type == 'amazon':
 146 |       self._run_raw('sudo yum install tmux -y')
 147 |       del tmux_cmd[1]  # Amazon tmux is really old, no mouse option
 148 | 
 149 |     if not util.is_set("NCLUSTER_NOKILL_TMUX"):
 150 |       self._run_raw(f'tmux kill-session -t {self.tmux_session}',
 151 |                     ignore_errors=True)
 152 |     else:
 153 |       print(
 154 |         "Warning, NCLUSTER_NOKILL_TMUX is on, make sure remote tmux prompt is available or things will hang")
 155 | 
 156 |     self._run_raw(''.join(tmux_cmd))
 157 | 
 158 |     self._can_run = True
 159 | 
 160 |   def _mount_efs(self):
 161 |     self.log("Mounting EFS")
 162 |     region = u.get_region()
 163 |     efs_id = u.get_efs_dict()[u.get_prefix()]
 164 |     dns = f"{efs_id}.efs.{region}.amazonaws.com"
 165 |     self.run('sudo mkdir -p /ncluster')
 166 | 
 167 |     # ignore error on remount (efs already mounted)
 168 |     self.run(
 169 |       f"sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 {dns}:/ /ncluster",
 170 |       ignore_errors=True)
 171 | 
 172 |     # sometimes mount command doesn't work, make sure it's really mounted before returning
 173 |     stdout, stderr = self.run_with_output('df')
 174 |     while '/ncluster' not in stdout:
 175 |       sleep_sec = 2
 176 |       util.log(f"EFS not yet mounted, sleeping {sleep_sec} seconds")
 177 |       time.sleep(sleep_sec)
 178 |       self.run(
 179 |         f"sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 {dns}:/ /ncluster",
 180 |         ignore_errors=True)
 181 |       stdout, stderr = self.run_with_output('df')
 182 | 
 183 |     self.run('sudo chmod 777 /ncluster')
 184 | 
 185 |     # Hack below may no longer be needed
 186 |     # # make sure chmod is successful, hack to fix occasional permission errors
 187 |     # while 'drwxrwxrwx' not in self.run_and_capture_output('ls -ld /ncluster'):
 188 |     #   print(f"chmod 777 /ncluster didn't take, retrying in {TIMEOUT_SEC}")
 189 |     #   time.sleep(TIMEOUT_SEC)
 190 |     #   self.run('sudo chmod 777 /ncluster')
 191 | 
 192 |     # TODO(y): build a pstree and warn if trying to run something while main tmux bash has a subprocess running
 193 |     # this would ensure that commands being sent are not being swallowed
 194 | 
 195 |   def run(self, cmd, non_blocking=False, ignore_errors=False,
 196 |           max_wait_sec=365 * 24 * 3600,
 197 |           check_interval=0.2):
 198 | 
 199 |     # TODO(y): make _run_with_output_on_failure default, and delete this
 200 |     if util.is_set('NCLUSTER_RUN_WITH_OUTPUT_ON_FAILURE') or True:
 201 |       # experimental version that captures output and prints it on failure
 202 |       # redirection things break bash commands, so
 203 |       # don't redirect on bash commands like source
 204 |       # TODO(y): remove this; the filtering was put in because I thought it broke
 205 |       # source activate, but now it seems it doesn't
 206 |       if not util.is_bash_builtin(cmd) or True:
 207 |         return self._run_with_output_on_failure(cmd, non_blocking,
 208 |                                                 ignore_errors,
 209 |                                                 max_wait_sec)
 210 |       else:
 211 |         self.log("Found bash built-in, using regular run")
 212 | 
 213 |     if not self._can_run:
 214 |       assert False, "Using .run before initialization finished"
 215 | 
 216 |     if '\n' in cmd:
 217 |       cmds = cmd.split('\n')
 218 |       self.log(
 219 |         f"Running {len(cmds)} commands at once, returning status of last")
 220 |       status = -1
 221 |       for subcmd in cmds:
 222 |         status = self.run(subcmd)
 223 |         self.last_status = status
 224 |       return status
 225 | 
 226 |     cmd = cmd.strip()
 227 |     if cmd.startswith('#'):  # ignore empty/commented out lines
 228 |       return -1
 229 |     self.run_counter += 1
 230 |     self.log("tmux> %s", cmd)
 231 | 
 232 |     self._cmd = cmd
 233 |     self._cmd_fn = f'{self.remote_scratch}/{self.run_counter}.cmd'
 234 |     self._status_fn = f'{self.remote_scratch}/{self.run_counter}.status'
 235 | 
 236 |     cmd = util.shell_strip_comment(cmd)
 237 |     assert '&' not in cmd, f"cmd {cmd} contains &, that breaks things"
 238 | 
 239 |     # modify command to dump shell success status into file
 240 |     self.file_write(self._cmd_fn, cmd + '\n')
 241 |     modified_cmd = f'{cmd}; echo $? > {self._status_fn}'
 242 |     modified_cmd = shlex.quote(modified_cmd)
 243 | 
 244 |     tmux_window = self.tmux_session + ':' + str(self.tmux_window_id)
 245 |     tmux_cmd = f'tmux send-keys -t {tmux_window} {modified_cmd} Enter'
 246 |     self._run_raw(tmux_cmd, ignore_errors=ignore_errors)
 247 |     if non_blocking:
 248 |       return 0
 249 | 
 250 |     if not self.wait_for_file(self._status_fn, max_wait_sec=30):
 251 |       self.log(f"Retrying waiting for {self._status_fn}")
 252 |     while not self.exists(self._status_fn):
 253 |       self.log(f"Still waiting for {cmd}")
 254 |       self.wait_for_file(self._status_fn, max_wait_sec=30)
 255 |     contents = self.read(self._status_fn)
 256 | 
 257 |     # if empty wait a bit to allow for race condition
 258 |     if len(contents) == 0:
 259 |       time.sleep(check_interval)
 260 |       contents = self.read(self._status_fn)
 261 |     status = int(contents.strip())
 262 |     self.last_status = status
 263 | 
 264 |     if status != 0:
 265 |       if not ignore_errors:
 266 |         raise RuntimeError(f"Command {cmd} returned status {status}")
 267 |       else:
 268 |         self.log(f"Warning: command {cmd} returned status {status}")
 269 | 
 270 |     return status
 271 | 
 272 |   def join(self, ignore_errors=False):
 273 |     """Waits until last executed command completed."""
 274 |     assert self._status_fn, "Asked to join a task which hasn't had any commands executed on it"
 275 |     check_interval = 0.2
 276 |     status_fn = self._status_fn
 277 |     if not self.wait_for_file(status_fn, max_wait_sec=30):
 278 |       self.log(f"Retrying waiting for {status_fn}")
 279 |     while not self.exists(status_fn):
 280 |       self.log(f"Still waiting for {self._cmd}")
 281 |       self.wait_for_file(status_fn, max_wait_sec=30)
 282 |     contents = self.read(status_fn)
 283 | 
 284 |     # if empty wait a bit to allow for race condition
 285 |     if len(contents) == 0:
 286 |       time.sleep(check_interval)
 287 |       contents = self.read(status_fn)
 288 |     status = int(contents.strip())
 289 |     self.last_status = status
 290 | 
 291 |     if status != 0:
 292 |       extra_msg = '(ignoring error)' if ignore_errors else '(failing)'
 293 |       if util.is_set('NCLUSTER_RUN_WITH_OUTPUT_ON_FAILURE') or True:
 294 |         self.log(
 295 |           f"Start failing output {extra_msg}: \n{'*'*80}\n\n '{self.read(self._out_fn)}'")
 296 |         self.log(f"\n{'*'*80}\nEnd failing output")
 297 |       if not ignore_errors:
 298 |         raise RuntimeError(f"Command {self._cmd} returned status {status}")
 299 |       else:
 300 |         self.log(f"Warning: command {self._cmd} returned status {status}")
 301 | 
 302 |     return status
 303 | 
 304 |   def _run_with_output_on_failure(self, cmd, non_blocking=False,
 305 |                                   ignore_errors=False,
 306 |                                   max_wait_sec=365 * 24 * 3600,
 307 |                                   check_interval=0.2) -> str:
 308 |     """Experimental version of run that propagates error messages to the client. This will eventually become the default "run"."""
 309 | 
 310 |     if not self._can_run:
 311 |       assert False, "Using .run before initialization finished"
 312 | 
 313 |     if '\n' in cmd:
 314 |       assert False, "Don't support multi-line for run2"
 315 | 
 316 |     cmd = cmd.strip()
 317 |     if cmd.startswith('#'):  # ignore empty/commented out lines
 318 |       return ''
 319 |     self.run_counter += 1
 320 |     self.log("tmux> %s", cmd)
 321 | 
 322 |     self._cmd = cmd
 323 |     self._cmd_fn = f'{self.remote_scratch}/{self.run_counter}.cmd'
 324 |     self._status_fn = f'{self.remote_scratch}/{self.run_counter}.status'
 325 |     self._out_fn = f'{self.remote_scratch}/{self.run_counter}.out'
 326 | 
 327 |     cmd = util.shell_strip_comment(cmd)
 328 |     assert '&' not in cmd, f"cmd {cmd} contains &, that breaks things"
 329 | 
 330 |     # modify command to dump shell success status into file
 331 |     self.file_write(self._cmd_fn, cmd + '\n')
 332 | 
 333 |     #    modified_cmd = f'{cmd} > {out_fn} 2>&1; echo $? > {status_fn}'
 334 |     # https://stackoverflow.com/a/692407/419116
 335 |     # $cmd > >(tee -a fn) 2> >(tee -a fn >&2)
 336 | 
 337 |     modified_cmd = f'{cmd} > >(tee -a {self._out_fn}) 2> >(tee -a {self._out_fn} >&2); echo $? > {self._status_fn}'
 338 |     modified_cmd = shlex.quote(modified_cmd)
 339 | 
 340 |     start_time = time.time()
 341 |     tmux_window = self.tmux_session + ':' + str(self.tmux_window_id)
 342 |     tmux_cmd = f"tmux send-keys -t {tmux_window} {modified_cmd} Enter"
 343 |     self._run_raw(tmux_cmd, ignore_errors=ignore_errors)
 344 |     if non_blocking:
 345 |       return 0
 346 | 
 347 |     if not self.wait_for_file(self._status_fn, max_wait_sec=60):
 348 |       self.log(f"Retrying waiting for {self._status_fn}")
 349 |     elapsed_time = time.time() - start_time
 350 |     while not self.exists(self._status_fn) and elapsed_time < max_wait_sec:
 351 |       self.log(f"Still waiting for {cmd}")
 352 |       self.wait_for_file(self._status_fn, max_wait_sec=60)
 353 |       elapsed_time = time.time() - start_time
 354 |     contents = self.read(self._status_fn)
 355 | 
 356 |     # if empty wait a bit to allow for race condition
 357 |     if len(contents) == 0:
 358 |       time.sleep(check_interval)
 359 |       contents = self.read(self._status_fn)
 360 |     status = int(contents.strip())
 361 |     self.last_status = status
 362 | 
 363 |     if status != 0:
 364 |       extra_msg = '(ignoring error)' if ignore_errors else '(failing)'
 365 |       self.log(
 366 |         f"Start failing output {extra_msg}: \n{'*'*80}\n\n '{self.read(self._out_fn)}'")
 367 |       self.log(f"\n{'*'*80}\nEnd failing output")
 368 |       if not ignore_errors:
 369 |         raise RuntimeError(f"Command {cmd} returned status {status}")
 370 |       else:
 371 |         self.log(f"Warning: command {cmd} returned status {status}")
 372 | 
 373 |     return self.read(self._out_fn)
 374 | 
 375 |   def _run_raw(self, cmd: str, ignore_errors=False) -> Tuple[str, str]:
 376 |     """Runs given cmd in the task using current SSH session, returns
 377 |     stdout/stderr as strings. Because it blocks until cmd is done, use it for
 378 |     short cmds. Fails an assertion on a non-zero exit status unless ignore_errors is set.
 379 | 
 380 |     This is a barebones method, to be used during initialization, that has
 381 |     minimal dependencies (no tmux).
 382 |     """
 383 |     #    self._log("run_ssh: %s"%(cmd,))
 384 | 
 385 |     stdin, stdout, stderr = u.call_with_retries(self.ssh_client.exec_command,
 386 |                                                 command=cmd, get_pty=True)
 387 |     stdout_str = stdout.read().decode()
 388 |     stderr_str = stderr.read().decode()
 389 |     if stdout.channel.recv_exit_status() != 0:
 390 |       if not ignore_errors:
 391 |         self.log(f"command ({cmd}) failed with --->")
 392 |         self.log("failing stdout: " + stdout_str)
 393 |         self.log("failing stderr: " + stderr_str)
 394 |         assert False, "_run_raw failed (see logs for error)"
 395 | 
 396 |     return stdout_str, stderr_str
 397 | 
 398 |   def upload(self, local_fn: str, remote_fn: str = '',
 399 |              dont_overwrite: bool = False) -> None:
 400 |     """Uploads file to remote instance. If location not specified, dumps it
 401 |     into default directory. If remote location has files or directories with the
 402 |      same name, behavior is undefined."""
 403 | 
 404 |     # support wildcard through glob
 405 |     if '*' in local_fn:
 406 |       for local_subfn in glob.glob(local_fn):
 407 |         self.upload(local_subfn)
 408 |       return
 409 | 
 410 |     if '#' in local_fn:  # hashes also give problems from shell commands
 411 |       self.log("skipping backup file {local_fn}")
 412 |       return
 413 | 
 414 |     if not self.sftp:
 415 |       self.sftp = u.call_with_retries(self.ssh_client.open_sftp,
 416 |                                       'self.ssh_client.open_sftp')
 417 | 
 418 |     def maybe_fix_mode(local_fn_, remote_fn_):
 419 |       """Makes the remote file executable when the local file is executable"""
 420 |       mode = oct(os.stat(local_fn_)[stat.ST_MODE])[-3:]
 421 |       if '7' in mode:
 422 |         self.log(f"Making {remote_fn_} executable with mode {mode}")
 423 |         # use raw run, in case tmux is unavailable
 424 |         self._run_raw(f"chmod {mode} {remote_fn_}")
 425 | 
 426 |     # augmented SFTP client that can transfer directories, from
 427 |     # https://stackoverflow.com/a/19974994/419116
 428 |     def _put_dir(source, target):
 429 |       """ Uploads the contents of the source directory to the target path."""
 430 | 
 431 |       def _safe_mkdir(path, mode=511, ignore_existing=True):
 432 |         """ Augments mkdir by adding an option to not fail if the folder exists."""
 433 |         try:
 434 |           self.sftp.mkdir(path, mode)
 435 |         except IOError:
 436 |           if ignore_existing:
 437 |             pass
 438 |           else:
 439 |             raise
 440 | 
 441 |       assert os.path.isdir(source)
 442 |       _safe_mkdir(target)
 443 | 
 444 |       for item in os.listdir(source):
 445 |         if os.path.isfile(os.path.join(source, item)):
 446 |           self.sftp.put(os.path.join(source, item), os.path.join(target, item))
 447 |           maybe_fix_mode(os.path.join(source, item), os.path.join(target, item))
 448 |         else:
 449 |           _safe_mkdir(f'{target}/{item}')
 450 |           _put_dir(f'{source}/{item}', f'{target}/{item}')
 451 | 
 452 |     if not remote_fn:
 453 |       remote_fn = os.path.basename(local_fn)
 454 | 
 455 |     self.log('uploading ' + local_fn + ' to ' + remote_fn)
 456 |     remote_fn = remote_fn.replace('~', self.homedir)
 457 | 
 458 |     if '/' in remote_fn:
 459 |       remote_dir = os.path.dirname(remote_fn)
 460 |       assert self.exists(
 461 |         remote_dir), f"Remote dir {remote_dir} doesn't exist"
 462 |     if dont_overwrite and self.exists(remote_fn):
 463 |       self.log("Remote file %s exists, skipping" % (remote_fn,))
 464 |       return
 465 | 
 466 |     assert os.path.exists(local_fn), f"{local_fn} not found"
 467 |     if os.path.isdir(local_fn):
 468 |       _put_dir(local_fn, remote_fn)
 469 |     else:
 470 |       assert os.path.isfile(local_fn), "%s is not a file" % (local_fn,)
 471 |       # this crashes with IOError when upload failed
 472 |       if self.exists(remote_fn) and self.isdir(remote_fn):
 473 |         remote_fn = remote_fn + '/' + os.path.basename(local_fn)
 474 |       self.sftp.put(localpath=local_fn, remotepath=remote_fn)
 475 |       maybe_fix_mode(local_fn, remote_fn)
 476 | 
 477 |   def download(self, remote_fn, local_fn=''):
 478 |     self.log("downloading %s" % remote_fn)
 479 |     # sometimes open_sftp fails with Administratively prohibited, do retries
 480 |     # root cause could be too many SSH connections being open
 481 |     # https://unix.stackexchange.com/questions/14160/ssh-tunneling-error-channel-1-open-failed-administratively-prohibited-open
 482 |     if not self.sftp:
 483 |       self.sftp = u.call_with_retries(self.ssh_client.open_sftp,
 484 |                                       'self.ssh_client.open_sftp')
 485 |     if not local_fn:
 486 |       local_fn = os.path.basename(remote_fn)
 487 |       self.log("downloading %s to %s" % (remote_fn, local_fn))
 488 |     self.sftp.get(remote_fn, local_fn)
 489 | 
 490 |   def exists(self, remote_fn):
 491 |     stdout, stderr = self._run_raw('stat ' + remote_fn, ignore_errors=True)
 492 |     return 'No such file' not in stdout
 493 | 
 494 |   def write(self, remote_fn, contents):
 495 |     tmp_fn = self.local_scratch + '/' + str(util.now_micros())
 496 |     open(tmp_fn, 'w').write(contents)
 497 |     self.upload(tmp_fn, remote_fn)
 498 | 
 499 |   def read(self, remote_fn):
 500 |     tmp_fn = self.local_scratch + '/' + str(util.now_micros())
 501 |     self.download(remote_fn, tmp_fn)
 502 |     return open(tmp_fn).read()
 503 | 
 504 |   def isdir(self, remote_fn):
 505 |     stdout, _stderr = self._run_raw('ls -ld ' + remote_fn)
 506 |     return stdout.startswith('d')
 507 | 
 508 |   def switch_window(self, window_id: int):
 509 |     """
 510 |     Switches currently active tmux window for given task. 0 is the default window
 511 |     Args:
 512 |       window_id: integer id of tmux window to use
 513 |     """
 514 | 
 515 |     # windows are numbered sequentially 0, 1, 2, ...
 516 |     # create any missing windows and make them point to the same directory
 517 |     if window_id not in self.tmux_available_window_ids:
 518 |       for i in range(max(self.tmux_available_window_ids) + 1, window_id + 1):
 519 |         self._run_raw(f'tmux new-window -t {self.tmux_session} -d')
 520 |         self.tmux_available_window_ids.append(i)
 521 | 
 522 |     self.tmux_window_id = window_id
 523 | 
 524 | 
 525 |   @property
 526 |   def logdir(self):
 527 |     """Returns logging directory, creating one if necessary. See "Logdir" section
 528 |     of design doc on naming convention"""
 529 | 
 530 |     run_name = ncluster_globals.get_run_for_task(self)
 531 |     logdir = ncluster_globals.get_logdir(run_name)
 532 |     if logdir:
 533 |       return logdir
 534 | 
 535 |     # create logdir. Only single task in a group creates the logdir
 536 |     if ncluster_globals.is_chief(self, run_name):
 537 |       chief = self
 538 |     else:
 539 |       chief = ncluster_globals.get_chief(run_name)
 540 | 
 541 |     chief.setup_logdir()
 542 |     return ncluster_globals.get_logdir(run_name)
 543 | 
 544 |     # release lock
 545 | 
 546 |   def setup_logdir(self):
 547 |     # todo: locking on logdir creation
 548 | 
 549 |     """Create logdir for task/job/run
 550 |     """
 551 |     run_name = ncluster_globals.get_run_for_task(self)
 552 |     self.log("Creating logdir for run " + run_name)
 553 |     logdir_root = ncluster_globals.LOGDIR_ROOT
 554 |     assert logdir_root
 555 | 
 556 |     self.run(f'mkdir -p {logdir_root}')
 557 |     find_command = f'find {logdir_root} -maxdepth 1 -type d'
 558 | 
 559 |     stdout, stderr = self.run_with_output(find_command)
 560 |     logdir = f"{logdir_root}/{run_name}"
 561 | 
 562 |     counter = 0
 563 |     while logdir in stdout:
 564 |       counter += 1
 565 |       new_logdir = f'{logdir_root}/{run_name}.{counter:02d}'
 566 |       self.log(f'Warning, logdir {logdir} exists, deduping to {new_logdir}')
 567 |       logdir = new_logdir
 568 |     self.run(f'mkdir -p {logdir}')
 569 | 
 570 |     ncluster_globals.set_logdir(run_name, logdir)
 571 |     return logdir
 572 | 
 573 |     # legacy methods
  def file_exists(self, remote_fn):
    """Legacy alias, delegates to exists()."""
    return self.exists(remote_fn)
 576 | 
  def file_write(self, *args, **kwargs):
    """Legacy alias, forwards all arguments to write()."""
    return self.write(*args, **kwargs)
 579 | 
  def file_read(self, remote_fn):
    """Legacy alias, delegates to read()."""
    return self.read(remote_fn)
 582 | 
class Job(backend.Job):
  """Job for the AWS backend; adds nothing on top of backend.Job."""
  pass
 585 | 
 586 | 
 587 | class Run(backend.Run):
 588 |   """Run is a collection of jobs that share state. IE, training run will contain gradient worker job, parameter
 589 |   server job, and TensorBoard visualizer job. These jobs will use the same shared directory to store checkpoints and
 590 |   event files.
 591 |   :ivar aws_placement_group_name: somedoc
 592 |   """
 593 |   placement_group: str  # unique identifier to use as placement_group group name
 594 |   jobs: List[Job]
 595 | 
 596 |   def __init__(self, name='', **kwargs):
 597 |     """Creates a run. If install_script is specified, it's used as default
 598 |     install_script for all jobs (can be overridden by Job constructor)"""
 599 | 
 600 |     assert name, "Must specify name for current run"
 601 | 
 602 |     jobs = []
 603 |     self.name = name
 604 |     self.jobs = jobs
 605 |     self.kwargs = kwargs
 606 |     util.log(f"Choosing placement_group for run {name}")
 607 |     self.placement_group = name + '-' + util.random_id()
 608 | 
 609 |   @property
 610 |   def logdir(self):
 611 |     # querying logdir has a side-effect of creation, so do it on chief task
 612 |     chief_task = ncluster_globals.get_chief(self.name)
 613 |     return chief_task.logdir
 614 | 
 615 |   # TODO: currently this is synchronous, use non_blocking wrapper like in Job to parallelize methods
 616 |   def run(self, *args, **kwargs):
 617 |     """Runs command on every job in the run."""
 618 | 
 619 |     for job in self.jobs:
 620 |       job.run(*args, **kwargs)
 621 | 
 622 |   def run_with_output(self, *args, **kwargs):
 623 |     """Runs command on every first job in the run, returns stdout."""
 624 |     for job in self.jobs:
 625 |       job.run_with_output(*args, **kwargs)
 626 | 
 627 |   def _run_raw(self, *args, **kwargs):
 628 |     """_run_raw on every job in the run."""
 629 |     for job in self.jobs:
 630 |       job._run_raw(*args, **kwargs)
 631 | 
 632 |   def upload(self, *args, **kwargs):
 633 |     """Runs command on every job in the run."""
 634 |     for job in self.jobs:
 635 |       job.upload(*args, **kwargs)
 636 | 
 637 |   def make_job(self, name='', **kwargs):
 638 |     return make_job(name+'.'+self.name, run_name=self.name, **kwargs)
 639 | 
 640 | 
def make_task(
        name: str = '',
        run_name: str = '',
        install_script: str = '',
        instance_type: str = '',
        image_name: str = '',
        disk_size: int = 0,
        preemptible=None,
        logging_task: backend.Task = None,
        create_resources=True,
        spot=False
) -> Task:
  """
  Create task on AWS.

  Automatically places it in singleton Run/singleton Job objects, see Run/Job/Task hierarchy for details
  https://docs.google.com/document/d/1Gg4T243cYrDUW1YDCikmqp7fzSQDU3rZxOkJr9ohhs8/edit#heading=h.j4td4oixogib

  An instance found by u.lookup_instance for the same name/instance type/image
  is reused (and started if it is stopped), otherwise a new one is created.

  Args:
    disk_size: default size of root disk, in GBs
    create_resources: whether this task will handle resource creation
    name: see ncluster.make_task
    run_name: see ncluster.make_task
    install_script: see ncluster.make_task
    instance_type: instance type to use, defaults to $NCLUSTER_INSTANCE or t3.micro if unset
    image_name: name of image, ie, "Deep Learning AMI (Ubuntu) Version 12.0", defaults to $NCLUSTER_IMAGE or amzn2-ami-hvm-2.0.20180622.1-x86_64-gp2 if unset
    preemptible: use cheaper preemptible/spot instances. When None, taken from
      $NCLUSTER_PREEMPTIBLE (any non-empty value counts as True). NOTE(review):
      in this function the value is only logged, `spot` is what selects spot
      instances.
    logging_task: partially initialized Task object, use it for logging
    spot: if True, new instance is created with u.create_spot_instances
      instead of ec2.create_instances

  Returns:
    Task wrapping the instance, registered with ncluster_globals under run_name
  """

  ncluster_globals.task_launched = True

  def log(*_args):
    # log through the placeholder task when one is given, globally otherwise
    if logging_task:
      logging_task.log(*_args)
    else:
      util.log(*_args)

  # if name not specified, use name which is the same across script invocations for given image/instance-type
  name = ncluster_globals.auto_assign_task_name_if_needed(name, instance_type,
                                                          image_name)

  if not instance_type:
    instance_type = os.environ.get('NCLUSTER_INSTANCE', 't3.micro')
    log("Using instance " + instance_type)

  _set_aws_environment()
  if create_resources:
    _maybe_create_resources(logging_task=logging_task)
  else:
    pass

  # placement group is only used when the instance type supports it and the
  # task belongs to a registered run
  run: Run = ncluster_globals.get_run_object(run_name)
  placement_group = ''
  if u.instance_supports_placement_groups(instance_type) and run:
    placement_group = run.placement_group
    log(f"Launching into placement_group group {placement_group}")
    u.maybe_create_placement_group(run.placement_group)

  if not image_name:
    image_name = os.environ.get('NCLUSTER_IMAGE',
                                GENERIC_SMALL_IMAGE)
  log("Using image " + image_name)

  if preemptible is None:
    # environment values are strings, so any non-empty value becomes True
    preemptible = os.environ.get('NCLUSTER_PREEMPTIBLE', False)
    preemptible = bool(preemptible)
    if preemptible:
      log("Using preemptible instances")

  image = u.lookup_image(image_name)
  keypair = u.get_keypair()
  security_group = u.get_security_group()
  ec2 = u.get_ec2_resource()

  # both helpers below are no-ops when no existing instance was found
  instance = u.lookup_instance(name, instance_type,
                               image_name)
  _maybe_start_instance(instance)
  _maybe_wait_for_initializing_instance(instance)

  # create the instance if not present
  if instance:
    log(f"Reusing {instance}")
  else:
    log(f"Allocating {instance_type} for task {name}")
    args = {'ImageId': image.id,
            'InstanceType': instance_type,
            'MinCount': 1,
            'MaxCount': 1,
            'SecurityGroupIds': [security_group.id],
            'KeyName': keypair.name}

    # the Name tag carries the task name
    args['TagSpecifications'] = [{
      'ResourceType': 'instance',
      'Tags': [{
        'Key': 'Name',
        'Value': name
      }]
    }]

    #    subnet = u.get_subnet()
    #    args['NetworkInterfaces'] = [{'SubnetId': subnet.id,
    #                                  'DeviceIndex': 0,
    #                                  'AssociatePublicIpAddress': True,
    #                                  'Groups': [security_group.id]}]
    #    placement_specs = {'AvailabilityZone': u.get_zone()}

    placement_specs = {}
    if placement_group:
      placement_specs['GroupName'] = placement_group

    args['Placement'] = placement_specs
    args['Monitoring'] = {'Enabled': True}

    if disk_size:
      assert disk_size > 0
      ebs = {
        'VolumeSize': disk_size,
        'VolumeType': 'gp2',
      }

      args['BlockDeviceMappings'] = [{
        'DeviceName': '/dev/sda1',
        'Ebs': ebs
      }]

    # Use high throughput disk (0.065/iops-month = about $1/hour)
    if 'NCLUSTER_AWS_FAST_ROOTDISK' in os.environ:
      assert not disk_size, f"Specified both disk_size {disk_size} and $NCLUSTER_AWS_FAST_ROOTDISK, they are incompatible as $NCLUSTER_AWS_FAST_ROOTDISK hardwired disk size"

      ebs = {
        'VolumeSize': 500,
        'VolumeType': 'io1',
        'Iops': 11500
      }

      args['BlockDeviceMappings'] = [{
        'DeviceName': '/dev/sda1',
        'Ebs': ebs
      }]

    instances = []
    try:
      if spot:
        instances = u.create_spot_instances(args)
      else:
        instances = ec2.create_instances(**args)
    except Exception as e:
      log(f"Instance creation for {name} failed with ({e})")
      log(
        "You can change availability zone using export NCLUSTER_ZONE=...")
      log("Terminating")
      # signal the whole process, as this may run in a worker thread
      os.kill(os.getpid(),
              signal.SIGINT)  # sys.exit() doesn't work inside thread

    assert instances, f"ec2.create_instances returned {instances}"
    log(f"Allocated {len(instances)} instances")
    instance = instances[0]

  task = Task(name, instance=instance,
              install_script=install_script,
              image_name=image_name,
              instance_type=instance_type)

  ncluster_globals.register_task(task, run_name)
  return task
 811 | 
 812 | 
 813 | def make_job(
 814 |         name: str = '',
 815 |         run_name: str = '',
 816 |         num_tasks: int = 1,
 817 |         install_script: str = '',
 818 |         instance_type: str = '',
 819 |         image_name: str = '',
 820 |         create_resources=True,
 821 |         **kwargs) -> Job:
 822 |   """
 823 |   Args:
 824 |     create_resources: if True, will create resources if necessary
 825 |     name: see backend.make_task
 826 |     run_name: see backend.make_task
 827 |     num_tasks: number of tasks to launch
 828 |     install_script: see make_task
 829 |     instance_type: see make_task
 830 |     image_name: see make_task
 831 | 
 832 |   Returns:
 833 | 
 834 |   """
 835 |   assert num_tasks > 0, f"Can't create job with {num_tasks} tasks"
 836 |   assert name.count(
 837 |     '.') <= 1, "Job name has too many .'s (see ncluster design: Run/Job/Task hierarchy for  convention)"
 838 | 
 839 |   # dummy tasks for logging
 840 |   tasks = [backend.Task(f"{i}.{name}") for i in range(num_tasks)]
 841 | 
 842 |   _set_aws_environment(tasks[0])
 843 |   if create_resources:
 844 |     _maybe_create_resources(tasks[0])
 845 | 
 846 |   name = ncluster_globals.auto_assign_job_name_if_needed(name)
 847 |   run_name = ncluster_globals.auto_assign_run_name_if_needed(run_name)
 848 |   _run = ncluster_globals.create_run_if_needed(run_name, make_run)
 849 | 
 850 |   job = Job(name=name, tasks=tasks, run_name=run_name, **kwargs)
 851 | 
 852 |   exceptions = []
 853 | 
 854 |   # make tasks in parallel
 855 |   def make_task_fn(i: int):
 856 |     try:
 857 |       tasks[i] = make_task(f"{i}.{name}", run_name=run_name,
 858 |                            install_script=install_script,
 859 |                            instance_type=instance_type, image_name=image_name,
 860 |                            logging_task=tasks[i],
 861 |                            create_resources=False,
 862 |                            # handle resources in job already
 863 |                            **kwargs)
 864 |     except Exception as e:
 865 |       exceptions.append(e)
 866 | 
 867 |   util.log("Creating threads")
 868 |   threads = [threading.Thread(name=f'make_task_{i}',
 869 |                               target=make_task_fn, args=[i])
 870 |              for i in range(num_tasks)]
 871 |   for thread in threads:
 872 |     thread.start()
 873 |   for thread in threads:
 874 |     thread.join()
 875 |   print("Exception are ", exceptions)
 876 |   if exceptions:
 877 |     raise exceptions[0]
 878 | 
 879 |   job.tasks = tasks
 880 | 
 881 |   # double check that all instances are in the same placement_group group
 882 |   # this can happen if some instances from previous smaller run are getting reused
 883 |   placement_dict = {task.instance.placement_group: task.name for task in
 884 |                     job.tasks}
 885 |   # TODO: make placement_group group name derived from run, to make it deterministic
 886 |   # on individual instance restarts
 887 |   if len(placement_dict) > 1:
 888 |     util.log("Job tasks are spread over multiple placement_group groups")
 889 |     pprint.pprint(placement_dict)
 890 |     raise RuntimeError(
 891 |       f"Got instance spread over multiple placement_group groups: {placement_dict}. Must terminate all instances in run {run_name} and try again.")
 892 |   return job
 893 | 
 894 | 
 895 | def make_run(name) -> Run:
 896 |   run = Run(name)
 897 |   ncluster_globals.register_run(name, run)
 898 |   return run
 899 | 
 900 | 
 901 | # TODO: this method and a few others are backend specific, document in API doc
 902 | def _maybe_start_instance(instance):
 903 |   """Starts instance if it's stopped, no-op otherwise."""
 904 | 
 905 |   if not instance:
 906 |     return
 907 | 
 908 |   if instance.state['Name'] == 'stopped':
 909 |     instance.start()
 910 |     while True:
 911 |       print(f"Waiting  for {instance} to start.")
 912 |       instance.reload()
 913 |       if instance.state['Name'] == 'running':
 914 |         break
 915 |       time.sleep(10)
 916 | 
 917 | 
 918 | def _maybe_wait_for_initializing_instance(instance):
 919 |   """Starts instance if it's stopped, no-op otherwise."""
 920 | 
 921 |   if not instance:
 922 |     return
 923 | 
 924 |   if instance.state['Name'] == 'initializing':
 925 |     while True:
 926 |       print(f"Waiting  for {instance} to leave state 'initializing'.")
 927 |       instance.reload()
 928 |       if instance.state['Name'] == 'running':
 929 |         break
 930 |       time.sleep(10)
 931 | 
 932 | 
 933 | def _maybe_create_resources(logging_task: Task = None):
 934 |   """Use heuristics to decide to possibly create resources"""
 935 | 
 936 |   def log(*args):
 937 |     if logging_task:
 938 |       logging_task.log(*args)
 939 |     else:
 940 |       util.log(*args)
 941 | 
 942 |   def should_create_resources():
 943 |     """Check if gateway, keypair, vpc exist."""
 944 |     prefix = u.get_prefix()
 945 |     if u.get_keypair_name() not in u.get_keypair_dict():
 946 |       log(f"Missing {u.get_keypair_name()} keypair, creating resources")
 947 |       return True
 948 |     vpcs = u.get_vpc_dict()
 949 |     if prefix not in vpcs:
 950 |       log(f"Missing {prefix} vpc, creating resources")
 951 |       return True
 952 |     vpc = vpcs[prefix]
 953 |     gateways = u.get_gateway_dict(vpc)
 954 |     if prefix not in gateways:
 955 |       log(f"Missing {prefix} gateway, creating resources")
 956 |       return True
 957 |     return False
 958 | 
 959 |   try:
 960 |     # this locking is approximate, still possible for threads to slip through
 961 |     if os.path.exists(AWS_LOCK_FN):
 962 |       pid, ts, lock_taskname = open(AWS_LOCK_FN).read().split('-')
 963 |       ts = int(ts)
 964 |       log(f"waiting for aws resource creation, another resource initiation was "
 965 |           f"initiated {int(time.time()-ts)} seconds ago by "
 966 |           f"{lock_taskname}, delete lock file "
 967 |           f"{AWS_LOCK_FN} if this is an error")
 968 |       while True:
 969 |         if os.path.exists(AWS_LOCK_FN):
 970 |           log(f"waiting for lock file {AWS_LOCK_FN} to get deleted "
 971 |               f"initiated {int(time.time()-ts)} seconds ago by ")
 972 |           time.sleep(2)
 973 |           continue
 974 |         else:
 975 |           break
 976 |       return
 977 | 
 978 |     with open(AWS_LOCK_FN, 'w') as f:
 979 |       f.write(
 980 |         f'{os.getpid()}-{int(time.time())}-{logging_task.name if logging_task else ""}')
 981 | 
 982 |     if not should_create_resources():
 983 |       util.log("Resources already created, no-op")
 984 |       os.remove(AWS_LOCK_FN)
 985 |       return
 986 | 
 987 |     create_lib.create_resources()
 988 |   finally:
 989 |     if os.path.exists(AWS_LOCK_FN):
 990 |       os.remove(AWS_LOCK_FN)
 991 | 
 992 | 
 993 | def _set_aws_environment(task: Task = None):
 994 |   """Sets up AWS environment from NCLUSTER environment variables"""
 995 |   current_zone = os.environ.get('NCLUSTER_ZONE', '')
 996 |   current_region = os.environ.get('AWS_DEFAULT_REGION', '')
 997 | 
 998 |   def log(*args):
 999 |     if task:
1000 |       task.log(*args)
1001 |     else:
1002 |       util.log(*args)
1003 | 
1004 |   if current_region and current_zone:
1005 |     assert current_zone.startswith(
1006 |       current_region), f'Current zone "{current_zone}" ($NCLUSTER_ZONE) is not ' \
1007 |                        f'in current region "{current_region} ($AWS_DEFAULT_REGION)'
1008 |     assert u.get_session().region_name == current_region  # setting from ~/.aws
1009 | 
1010 |   # zone is set, set region from zone
1011 |   if current_zone and not current_region:
1012 |     current_region = current_zone[:-1]
1013 |     os.environ['AWS_DEFAULT_REGION'] = current_region
1014 | 
1015 |   # neither zone nor region not set, use default setting for region
1016 |   # if default is not set, use NCLUSTER_DEFAULT_REGION
1017 |   if not current_region:
1018 |     current_region = u.get_session().region_name
1019 |     if not current_region:
1020 |       log(f"No default region available, using {NCLUSTER_DEFAULT_REGION}")
1021 |       current_region = NCLUSTER_DEFAULT_REGION
1022 |     os.environ['AWS_DEFAULT_REGION'] = current_region
1023 | 
1024 |   # zone not set, use first zone of the region
1025 |   #  if not current_zone:
1026 |   #    current_zone = current_region + 'a'
1027 |   #    os.environ['NCLUSTER_ZONE'] = current_zone
1028 | 
1029 |   log(f"Using account {u.get_account_number()}, region {current_region}, "
1030 |       f"zone {current_zone}")
1031 | 
1032 | 


--------------------------------------------------------------------------------
/ncluster/aws_create_resources.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #
  3 | # Creates resources
  4 | # This script creates VPC/security group/keypair if not already present
  5 | 
  6 | import os
  7 | import sys
  8 | import time
  9 | 
 10 | from ncluster import aws_util as u
 11 | from ncluster import util
 12 | 
# NOTE(review): DRYRUN and DEBUG are not referenced in the visible part of
# this file; confirm whether they are still used before relying on them.
DRYRUN = False
DEBUG = True

# Names of Amazon resources that are created. These settings are fixed across
# all runs, and correspond to resources created once per user per region.

# Ports that network_setup opens to 0.0.0.0/0 on a newly created security
# group. Each entry is a single port or a (from_port, to_port) pair.
PUBLIC_TCP_RANGES = [
  22,  # ssh
  # ipython notebook ports
  (8888, 8899),
  # redis port
  6379,
  # tensorboard ports
  (6006, 6016)
]

PUBLIC_UDP_RANGES = [(60000, 61000)]  # mosh ports
 30 | 
 31 | 
 32 | # TODO: this creates a custom VPC, but we are using default VPC, so have two security groups
 33 | # once we are sure we don't need custom VPC, can get rid of extra VPC creation
 34 | 
 35 | def network_setup():
 36 |   """Creates VPC if it doesn't already exists, configures it for public
 37 |   internet access, returns vpc, subnet, security_group"""
 38 | 
 39 |   # from https://gist.github.com/nguyendv/8cfd92fc8ed32ebb78e366f44c2daea6
 40 | 
 41 |   ec2 = u.get_ec2_resource()
 42 |   client = u.get_ec2_client()
 43 |   existing_vpcs = u.get_vpc_dict()
 44 |   zones = u.get_zones()
 45 | 
 46 |   # create VPC from scratch. Remove this if default VPC works well enough.
 47 |   vpc_name = u.get_vpc_name()
 48 |   if u.get_vpc_name() in existing_vpcs:
 49 |     print("Reusing VPC " + vpc_name)
 50 |     vpc = existing_vpcs[vpc_name]
 51 |     subnets = list(vpc.subnets.all())
 52 |     assert len(subnets) == len(
 53 |       zones), "Has %s subnets, but %s zones, something went wrong during resource creation, try delete_resources.py/create_resources.py" % (
 54 |       len(subnets), len(zones))
 55 | 
 56 |   else:
 57 |     print("Creating VPC " + vpc_name)
 58 |     vpc = ec2.create_vpc(CidrBlock='192.168.0.0/16')
 59 | 
 60 |     # enable DNS on the VPC
 61 |     response = vpc.modify_attribute(EnableDnsHostnames={"Value": True})
 62 |     assert u.is_good_response(response)
 63 |     response = vpc.modify_attribute(EnableDnsSupport={"Value": True})
 64 |     assert u.is_good_response(response)
 65 | 
 66 |     vpc.create_tags(Tags=u.create_name_tags(vpc_name))
 67 |     vpc.wait_until_available()
 68 | 
 69 |   gateways = u.get_gateway_dict(vpc)
 70 |   gateway_name = u.get_gateway_name()
 71 |   if gateway_name in gateways:
 72 |     print("Reusing gateways " + gateway_name)
 73 |   else:
 74 |     print("Creating internet gateway " + gateway_name)
 75 |     ig = ec2.create_internet_gateway()
 76 |     ig.attach_to_vpc(VpcId=vpc.id)
 77 |     ig.create_tags(Tags=u.create_name_tags(gateway_name))
 78 | 
 79 |     # check that attachment succeeded
 80 |     attach_state = u.extract_attr_for_match(ig.attachments, State=-1,
 81 |                                             VpcId=vpc.id)
 82 |     assert attach_state == 'available', "vpc %s is in state %s" % (vpc.id,
 83 |                                                                    attach_state)
 84 | 
 85 |     route_table = vpc.create_route_table()
 86 |     route_table_name = u.get_route_table_name()
 87 |     route_table.create_tags(Tags=u.create_name_tags(route_table_name))
 88 | 
 89 |     dest_cidr = '0.0.0.0/0'
 90 |     route_table.create_route(
 91 |       DestinationCidrBlock=dest_cidr,
 92 |       GatewayId=ig.id
 93 |     )
 94 |     # check success
 95 |     for route in route_table.routes:
 96 |       # result looks like this
 97 |       # ec2.Route(route_table_id='rtb-a8b438cf',
 98 |       #    destination_cidr_block='0.0.0.0/0')
 99 |       if route.destination_cidr_block == dest_cidr:
100 |         break
101 |     else:
102 |       # sometimes get
103 |       #      AssertionError: Route for 0.0.0.0/0 not found in [ec2.Route(route_table_id='rtb-cd9153b0', destination_cidr_block='192.168.0.0/16')]
104 |       # TODO: add a wait/retry?
105 |       assert False, "Route for %s not found in %s" % (dest_cidr,
106 |                                                       route_table.routes)
107 | 
108 |     assert len(zones) <= 16  # for cidr/20 to fit into cidr/16
109 |     ip = 0
110 |     for zone in zones:
111 |       cidr_block = '192.168.%d.0/20' % (ip,)
112 |       ip += 16
113 |       print("Creating subnet %s in zone %s" % (cidr_block, zone))
114 |       subnet = vpc.create_subnet(CidrBlock=cidr_block,
115 |                                  AvailabilityZone=zone)
116 |       subnet.create_tags(Tags=[{'Key': 'Name', 'Value': f'{vpc_name}-subnet'},
117 |                                {'Key': 'Region', 'Value': zone}])
118 |       response = client.modify_subnet_attribute(
119 |         MapPublicIpOnLaunch={'Value': True},
120 |         SubnetId=subnet.id
121 |       )
122 |       assert u.is_good_response(response)
123 |       u.wait_until_available(subnet)
124 |       assert subnet.map_public_ip_on_launch, "Subnet doesn't enable public IP by default, why?"
125 | 
126 |       route_table.associate_with_subnet(SubnetId=subnet.id)
127 | 
128 |   # Use default VPC from now on
129 |   vpc = u.get_default_vpc()
130 |   if not vpc:
131 |     util.log(f"Creating default VPC for region {u.get_region()}")
132 |     client.create_default_vpc()
133 |   vpc = u.get_default_vpc()
134 |   assert vpc, "Could not create default VPC?"
135 | 
136 |   existing_security_groups = u.get_security_group_dict()
137 |   security_group_name = u.get_security_group_name()
138 |   if security_group_name in existing_security_groups:
139 |     print("Reusing security group " + security_group_name)
140 |     security_group = existing_security_groups[security_group_name]
141 |     assert security_group.vpc_id == vpc.id, f"Found security group {security_group} " \
142 |                                             f"attached to {security_group.vpc_id} but expected {vpc.id}"
143 |   else:
144 |     print("Creating security group " + security_group_name)
145 |     security_group = ec2.create_security_group(
146 |       GroupName=security_group_name, Description=security_group_name,
147 |       VpcId=vpc.id)
148 | 
149 |     security_group.create_tags(Tags=u.create_name_tags(security_group_name))
150 | 
151 |     # allow ICMP access for public ping
152 |     security_group.authorize_ingress(
153 |       CidrIp='0.0.0.0/0',
154 |       IpProtocol='icmp',
155 |       FromPort=-1,
156 |       ToPort=-1
157 |     )
158 | 
159 |     # open public ports
160 |     # always include SSH port which is required for basic functionality
161 |     assert 22 in PUBLIC_TCP_RANGES, "Must enable SSH access"
162 |     for port in PUBLIC_TCP_RANGES:
163 |       if util.is_iterable(port):
164 |         assert len(port) == 2
165 |         from_port, to_port = port
166 |       else:
167 |         from_port, to_port = port, port
168 | 
169 |       response = security_group.authorize_ingress(IpProtocol="tcp",
170 |                                                   CidrIp="0.0.0.0/0",
171 |                                                   FromPort=from_port,
172 |                                                   ToPort=to_port)
173 |       assert u.is_good_response(response)
174 | 
175 |     for port in PUBLIC_UDP_RANGES:
176 |       if util.is_iterable(port):
177 |         assert len(port) == 2
178 |         from_port, to_port = port
179 |       else:
180 |         from_port, to_port = port, port
181 | 
182 |       response = security_group.authorize_ingress(IpProtocol="udp",
183 |                                                   CidrIp="0.0.0.0/0",
184 |                                                   FromPort=from_port,
185 |                                                   ToPort=to_port)
186 |       assert u.is_good_response(response)
187 | 
188 |     # allow ingress within security group
189 |     # Authorizing ingress doesn't work with names in a non-default VPC,
190 |     # so must use more complicated syntax
191 |     # https://github.com/boto/boto3/issues/158
192 |     response = {}
193 |     for protocol in ['icmp']:
194 |       try:
195 |         rule = {'FromPort': -1,
196 |                 'IpProtocol': protocol,
197 |                 'IpRanges': [],
198 |                 'PrefixListIds': [],
199 |                 'ToPort': -1,
200 |                 'UserIdGroupPairs': [{'GroupId': security_group.id}]}
201 |         response = security_group.authorize_ingress(IpPermissions=[rule])
202 |       except Exception as e:
203 |         if response['Error']['Code'] == 'InvalidPermission.Duplicate':
204 |           print("Warning, got " + str(e))
205 |         else:
206 |           assert False, "Failed while authorizing ingress with " + str(e)
207 | 
208 |     for protocol in ['tcp', 'udp']:
209 |       try:
210 |         rule = {'FromPort': 0,
211 |                 'IpProtocol': protocol,
212 |                 'IpRanges': [],
213 |                 'PrefixListIds': [],
214 |                 'ToPort': 65535,
215 |                 'UserIdGroupPairs': [{'GroupId': security_group.id}]}
216 |         response = security_group.authorize_ingress(IpPermissions=[rule])
217 |       except Exception as e:
218 |         if response['Error']['Code'] == 'InvalidPermission.Duplicate':
219 |           print("Warning, got " + str(e))
220 |         else:
221 |           assert False, "Failed while authorizing ingress with " + str(e)
222 | 
223 |   return vpc, security_group
224 | 
225 | 
def keypair_setup():
  """Ensure the current EC2 keypair exists and its private key is saved locally.

  If a keypair named u.get_keypair_name() is already known to AWS, the local
  .pem file (u.get_keypair_fn()) must exist and be non-empty. Otherwise a new
  keypair is created and its private key material is written to that file
  with owner-read-only permissions.

  Returns:
    The keypair object: the entry from u.get_keypair_dict() when reusing, or
    the result of ec2.create_key_pair() when newly created.

  Raises:
    AssertionError: if the AWS keypair and the local .pem file are out of sync.
  """

  # os.makedirs instead of shelling out to `mkdir -p`: no quoting issues
  os.makedirs(u.PRIVATE_KEY_LOCATION, exist_ok=True)

  keypair_name = u.get_keypair_name()
  keypair = u.get_keypair_dict().get(keypair_name, None)
  keypair_fn = u.get_keypair_fn()
  if keypair:
    print("Reusing keypair " + keypair_name)
    # check that local pem file exists and is readable
    assert os.path.exists(
      keypair_fn), "Keypair %s exists, but corresponding .pem file %s is not found, delete keypair %s through console and run again to recreate keypair/.pem together" % (
      keypair_name, keypair_fn, keypair_name)
    with open(keypair_fn) as pem_file:
      keypair_contents = pem_file.read()
    assert len(keypair_contents) > 0
  else:
    print("Creating keypair " + keypair_name)
    ec2 = u.get_ec2_resource()
    assert not os.path.exists(
      keypair_fn), "previous keypair exists, delete it with 'sudo rm %s' and also delete corresponding keypair through console" % (
      keypair_fn)
    keypair = ec2.create_key_pair(KeyName=keypair_name)

    # Create the file owner-only from the start so the private key is never
    # readable by other users, then drop the write bit once it is on disk.
    fd = os.open(keypair_fn, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'w') as pem_file:
      pem_file.write(keypair.key_material)
    os.chmod(keypair_fn, 0o400)

  return keypair
255 | 
256 | 
def placement_group_setup(group_name):
  """Return the placement group called group_name, creating it if necessary.

  An existing group is reused only if it is in the 'available' state and uses
  the 'cluster' strategy (asserted). A missing group is created with the
  'cluster' strategy. In both cases the group object is returned.
  """

  group = u.get_placement_group_dict().get(group_name, None)
  if not group:
    print("Creating group " + group_name)
    ec2 = u.get_ec2_resource()
    return ec2.create_placement_group(GroupName=group_name,
                                      Strategy='cluster')

  assert group.state == 'available'
  assert group.strategy == 'cluster'
  print("Reusing group ", group.name)
  return group
274 | 
275 | 
def create_resources():
  """Create (or reuse) the network, keypair and EFS for the current prefix.

  Runs network_setup() and keypair_setup(), creates the EFS named
  u.get_efs_name() unless u.get_efs_dict() already lists it, and then tries
  to create an EFS mount target in every subnet of the VPC, attaching the
  security group returned by network_setup().
  """
  print(f"Creating {u.get_prefix()} resources in region {u.get_region()}")

  vpc, security_group = network_setup()
  keypair_setup()  # saves private key locally to keypair_fn

  # create EFS
  efss = u.get_efs_dict()
  efs_name = u.get_efs_name()
  efs_id = efss.get(efs_name, '')
  if not efs_id:
    print("Creating EFS " + efs_name)
    efs_id = u.create_efs(efs_name)
  else:
    print("Reusing EFS " + efs_name)

  efs_client = u.get_efs_client()

  # create mount target for each subnet in the VPC

  # added retries because efs is not immediately available
  max_failures = 10
  retry_interval_sec = 1
  for subnet in vpc.subnets.all():
    for retry_attempt in range(max_failures):
      try:
        sys.stdout.write(
          "Creating efs mount target for %s ... " % (subnet.availability_zone,))
        sys.stdout.flush()
        response = efs_client.create_mount_target(FileSystemId=efs_id,
                                                  SubnetId=subnet.id,
                                                  SecurityGroups=[
                                                    security_group.id])
        # a non-good response without an exception falls through to the next
        # attempt immediately (no sleep)
        if u.is_good_response(response):
          print("success")
          break
      except Exception as e:
        if 'already exists' in str(e):  # ignore "already exists" errors
          print('already exists')
          break

        # Takes couple of seconds for EFS to come online, with
        # errors like this:
        # Creating efs mount target for us-east-1f ... Failed with An error occurred (IncorrectFileSystemLifeCycleState) when calling the CreateMountTarget operation: None, retrying in 1 sec

        print("Got %s, retrying in %s sec" % (str(e), retry_interval_sec))
        time.sleep(retry_interval_sec)
    else:
      # for/else: reached only when all max_failures attempts ended without a
      # break; the outer loop then moves on to the next subnet
      print("Giving up.")
325 | 
326 | 
# Script entry point: create all resources when run directly.
if __name__ == '__main__':
  create_resources()
329 | 


--------------------------------------------------------------------------------
/ncluster/aws_delete_resources.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #
  3 | # Deletes resources
  4 | 
  5 | import sys
  6 | import os
  7 | import argparse
  8 | 
  9 | from ncluster import aws_util as u
 10 | from ncluster import util
 11 | 
# NOTE: arguments are parsed at module level, so this happens on import too,
# not only when the file is run as a script.
parser = argparse.ArgumentParser()
parser.add_argument('--kind', type=str, default='all',
                    help="which resources to delete, all/network/keypair/efs")
parser.add_argument('--force-delete-efs', action='store_true',
                    help="force deleting main EFS")
args = parser.parse_args()

# All named resources share the current prefix as their name.
EFS_NAME = u.get_prefix()
VPC_NAME = u.get_prefix()
SECURITY_GROUP_NAME = u.get_prefix()
ROUTE_TABLE_NAME = u.get_prefix()
KEYPAIR_NAME = u.get_keypair_name()

# Module-level EC2 client/resource handles, created at import time.
client = u.get_ec2_client()
ec2 = u.get_ec2_resource()
 28 | 
 29 | def response_type(response):
 30 |   return 'ok' if u.is_good_response(response) else 'failed'
 31 | 
 32 | 
 33 | def delete_efs():
 34 |   efss = u.get_efs_dict()
 35 |   efs_id = efss.get(EFS_NAME, '')
 36 |   efs_client = u.get_efs_client()
 37 |   if efs_id:
 38 |     try:
 39 |       # delete mount targets first
 40 |       print("About to delete %s (%s)" % (efs_id, EFS_NAME))
 41 |       response = efs_client.describe_mount_targets(FileSystemId=efs_id)
 42 |       assert u.is_good_response(response)
 43 |       for mount_response in response['MountTargets']:
 44 |         id_ = mount_response['MountTargetId']
 45 |         sys.stdout.write('Deleting mount target %s ... ' % (id_,))
 46 |         sys.stdout.flush()
 47 |         response = efs_client.delete_mount_target(MountTargetId=id_)
 48 |         print(response_type(response))
 49 | 
 50 |       sys.stdout.write('Deleting EFS %s (%s)... ' % (efs_id, EFS_NAME))
 51 |       sys.stdout.flush()
 52 |       u.delete_efs_by_id(efs_id)
 53 | 
 54 |     except Exception as e:
 55 |       sys.stdout.write(f'failed with {e}\n')
 56 |       util.log_error(str(e) + '\n')
 57 | 
 58 | 
 59 | def delete_network():
 60 |   existing_vpcs = u.get_vpc_dict()
 61 |   if VPC_NAME in existing_vpcs:
 62 |     vpc = ec2.Vpc(existing_vpcs[VPC_NAME].id)
 63 |     print("Deleting VPC %s (%s) subresources:" % (VPC_NAME, vpc.id))
 64 | 
 65 |     for subnet in vpc.subnets.all():
 66 |       try:
 67 |         sys.stdout.write("Deleting subnet %s ... " % subnet.id)
 68 |         sys.stdout.write(response_type(subnet.delete()) + '\n')
 69 |       except Exception as e:
 70 |         sys.stdout.write('failed\n')
 71 |         util.log_error(str(e) + '\n')
 72 | 
 73 |     for gateway in vpc.internet_gateways.all():
 74 |       sys.stdout.write("Deleting gateway %s ... " % gateway.id)
 75 |       # note: if instances are using VPC, this fails with
 76 |       # botocore.exceptions.ClientError: An error occurred (DependencyViolation) when calling the DetachInternetGateway operation: Network vpc-ca4abab3 has some mapped public address(es). Please unmap those public address(es) before detaching the gateway.
 77 | 
 78 |       sys.stdout.write('detached ... ' if u.is_good_response(
 79 |         gateway.detach_from_vpc(VpcId=vpc.id)) else ' detach_failed ')
 80 |       sys.stdout.write('deleted ' if u.is_good_response(
 81 |         gateway.delete()) else ' delete_failed ')
 82 |       sys.stdout.write('\n')
 83 | 
 84 |     def desc():
 85 |       return "%s (%s)" % (route_table.id, u.get_name(route_table.tags))
 86 | 
 87 |     for route_table in vpc.route_tables.all():
 88 |       sys.stdout.write(f"Deleting route table {desc()} ... ")
 89 |       try:
 90 |         sys.stdout.write(response_type(route_table.delete()) + '\n')
 91 |       except Exception as e:
 92 |         sys.stdout.write('failed\n')
 93 |         util.log_error(str(e) + '\n')
 94 | 
 95 |     def desc():
 96 |       return "%s (%s, %s)" % (
 97 |         security_group.id, u.get_name(security_group.tags),
 98 |         security_group.group_name)
 99 | 
100 |     for security_group in vpc.security_groups.all():
101 |       # default group is undeletable, skip
102 |       if security_group.group_name == 'default':
103 |         continue
104 |       sys.stdout.write(
105 |         'Deleting security group %s ... ' % (desc()))
106 |       try:
107 |         sys.stdout.write(response_type(security_group.delete()) + '\n')
108 |       except Exception as e:
109 |         sys.stdout.write('failed\n')
110 |         util.log_error(str(e) + '\n')
111 | 
112 |     sys.stdout.write("Deleting VPC %s ... " % vpc.id)
113 |     try:
114 |       sys.stdout.write(response_type(vpc.delete()) + '\n')
115 |     except Exception as e:
116 |       sys.stdout.write('failed\n')
117 |       util.log_error(str(e) + '\n')
118 | 
119 | 
def delete_keypair():
  """Delete the AWS keypair named KEYPAIR_NAME and its local .pem file.

  The AWS-side deletion is skipped when the keypair is not listed by
  u.get_keypair_dict(). The local file at u.get_keypair_fn() is removed if it
  exists. Failures are passed to util.log_error rather than raised.
  """
  keypairs = u.get_keypair_dict()
  keypair = keypairs.get(KEYPAIR_NAME, '')
  if keypair:
    try:
      sys.stdout.write("Deleting keypair %s (%s) ... " % (keypair.key_name,
                                                          KEYPAIR_NAME))
      sys.stdout.write(response_type(keypair.delete()) + '\n')
    except Exception as e:
      sys.stdout.write('failed\n')
      util.log_error(str(e) + '\n')

  keypair_fn = u.get_keypair_fn()
  if os.path.exists(keypair_fn):
    print("Deleting local keypair file %s" % (keypair_fn,))
    # os.remove instead of a shell-built `rm -f`: the path is never
    # interpreted by a shell, and a failure is logged instead of being lost.
    try:
      os.remove(keypair_fn)
    except OSError as e:
      util.log_error(str(e) + '\n')
136 | 
137 | 
def delete_resources(force_delete_efs=False):
  """Delete the resource kinds selected by the --kind command-line argument.

  args.kind is matched by substring, so it may name several kinds at once
  ('efs', 'network', 'keypair') or 'all'.

  Args:
    force_delete_efs: when False, an EFS whose name equals u.DEFAULT_PREFIX
      is left in place and a notice is printed instead.
  """
  # Ask the boto3 session for the region, as create_resources does; reading
  # os.environ['AWS_DEFAULT_REGION'] raised KeyError whenever the region was
  # configured anywhere other than that environment variable.
  region = u.get_region()

  resource = u.get_prefix()
  print(f"Deleting {resource} resources in region {region}")
  print(f"Make sure {resource} instances are terminated or this will fail.")

  if 'efs' in args.kind or 'all' in args.kind:
    if EFS_NAME == u.DEFAULT_PREFIX and not force_delete_efs:
      # this is default EFS, likely has stuff, require extra flag to delete it
      print("default EFS has useful stuff in it, not deleting it. Use force-delete-efs "
            "flag to force")
    else:
      delete_efs()
  if 'network' in args.kind or 'all' in args.kind:
    delete_network()
  if 'keypair' in args.kind or 'all' in args.kind:
    delete_keypair()
156 | 
157 | 
# Script entry point: delete the resources selected on the command line.
if __name__ == '__main__':
  delete_resources(force_delete_efs=args.force_delete_efs)
160 | 


--------------------------------------------------------------------------------
/ncluster/aws_util.py:
--------------------------------------------------------------------------------
  1 | """Methods used in aws_backend, but also useful for standalone prototyping in Jupyter"""
  2 | 
  3 | import os
  4 | import re
  5 | import sys
  6 | import time
  7 | from collections import Iterable
  8 | from collections import OrderedDict
  9 | import paramiko
 10 | from operator import itemgetter
 11 | 
 12 | 
 13 | import boto3
 14 | 
 15 | from . import util
 16 | 
EMPTY_NAME = "noname"  # name to use when name attribute is missing on AWS
RETRY_INTERVAL_SEC = 1  # how long to sleep between retry attempts
RETRY_TIMEOUT_SEC = 30  # how long to keep retrying before giving up
DEFAULT_PREFIX = 'ncluster'  # prefix used when $NCLUSTER_PREFIX is not set
# directory for private key (.pem) files; requires $HOME to be set at import
PRIVATE_KEY_LOCATION = os.environ['HOME'] + '/.ncluster'
# when True, the *_dict lookups assert on duplicate names instead of only
# logging a warning
DUPLICATE_CHECKING = False
 23 | 
 24 | 
 25 | # Can't annotate boto3 return types because they are missing stubs
 26 | # https://github.com/boto/boto3/issues/1055
 27 | # https://stackoverflow.com/questions/52087307/adding-type-hinting-to-functions-that-return-boto3-objects
 28 | 
 29 | def get_vpc():
 30 |   """
 31 |   Returns current VPC (ec2.Vpc object)
 32 |   https://boto3.readthedocs.io/en/latest/reference/services/ec2.html#vpc
 33 |   """
 34 | 
 35 |   return get_vpc_dict()[get_prefix()]
 36 | 
 37 | 
 38 | def get_security_group():
 39 |   """
 40 |   Returns current security group, ec2.SecurityGroup object
 41 |   https://boto3.readthedocs.io/en/latest/reference/services/ec2.html#securitygroup
 42 | """
 43 |   return get_security_group_dict()[get_prefix()]
 44 | 
 45 | 
 46 | def get_subnet():
 47 |   return get_subnet_dict()[get_zone()]
 48 | 
 49 | 
 50 | def get_vpc_dict():
 51 |   """Returns dictionary of named VPCs {name: vpc}
 52 | 
 53 |   Assert fails if there's more than one VPC with same name."""
 54 | 
 55 |   client = get_ec2_client()
 56 |   response = client.describe_vpcs()
 57 |   assert is_good_response(response)
 58 | 
 59 |   result = OrderedDict()
 60 |   ec2 = get_ec2_resource()
 61 |   for vpc_response in response['Vpcs']:
 62 |     key = get_name(vpc_response.get('Tags', []))
 63 |     if not key or key == EMPTY_NAME:  # skip VPC's that don't have a name assigned
 64 |       continue
 65 | 
 66 |     if key in result:
 67 |       util.log(f"Warning: Duplicate VPC group {key} in {response}")
 68 |       if DUPLICATE_CHECKING:
 69 |         assert False
 70 |     result[key] = ec2.Vpc(vpc_response['VpcId'])
 71 | 
 72 |   return result
 73 | 
 74 | 
 75 | def get_default_vpc():
 76 |   """
 77 |   Return default VPC or none if not present
 78 | 
 79 |   """
 80 |   ec2 = get_ec2_resource()
 81 |   for vpc in ec2.vpcs.all():
 82 |     if vpc.is_default:
 83 |       return vpc
 84 | 
 85 | 
 86 | def get_subnet_dict():
 87 |   """Returns dictionary of "availability zone" -> subnet for current VPC."""
 88 |   subnet_dict = {}
 89 |   vpc = get_vpc()
 90 |   for subnet in vpc.subnets.all():
 91 |     zone = subnet.availability_zone
 92 |     assert zone not in subnet_dict, "More than one subnet in %s, why?" % (zone,)
 93 |     subnet_dict[zone] = subnet
 94 |   return subnet_dict
 95 | 
 96 | 
 97 | def get_gateway_dict(vpc):
 98 |   """Returns dictionary of named gateways for given VPC {name: gateway}"""
 99 |   return {get_name(gateway): gateway for
100 |           gateway in vpc.internet_gateways.all()}
101 | 
102 | 
def get_efs_dict():
  """Build an ordered {efs_name: efs_id} mapping of all named file systems.

  File systems whose name is missing or equals EMPTY_NAME are skipped.
  Asserts that no two file systems share a name."""
  # there's no EC2 resource for EFS objects, so return EFS_ID instead
  # https://stackoverflow.com/questions/47870342/no-ec2-resource-for-efs-objects

  efs_client = get_efs_client()
  listing = call_with_retries(efs_client.describe_file_systems,
                              'efs_client.describe_file_systems')
  assert is_good_response(listing)

  ids_by_name = OrderedDict()
  for fs_info in listing['FileSystems']:
    fs_id = fs_info['FileSystemId']

    # names live in tags, which need a separate per-file-system request
    tags = call_with_retries(efs_client.describe_tags,
                             "efs_client.describe_tags",
                             FileSystemId=fs_id, retry_interval_sec=2)
    assert is_good_response(tags)
    fs_name = get_name(tags['Tags'])
    if not fs_name or fs_name == EMPTY_NAME:
      continue  # skip EFS's without a name
    assert fs_name not in ids_by_name
    ids_by_name[fs_name] = fs_id

  return ids_by_name
127 | 
128 | 
def get_placement_group_dict():
  """Build an ordered {group_name: ec2.PlacementGroup} mapping.

  A repeated group name is logged as a warning; with DUPLICATE_CHECKING
  enabled it fails an assertion instead."""

  ec2_client = get_ec2_client()
  response = ec2_client.describe_placement_groups()
  assert is_good_response(response)

  groups_by_name = OrderedDict()
  ec2 = get_ec2_resource()
  for group_info in response['PlacementGroups']:
    group_name = group_info['GroupName']
    if group_name in groups_by_name:
      util.log(f"Warning: Duplicate placement_group group {group_name}")
      assert not DUPLICATE_CHECKING
    groups_by_name[group_name] = ec2.PlacementGroup(group_name)
  return groups_by_name
146 | 
147 | 
def get_security_group_dict():
  """Build an ordered {name: ec2.SecurityGroup} mapping of named groups.

  Groups whose name is missing or equals EMPTY_NAME are ignored. A repeated
  name is logged as a warning and the later group wins; with
  DUPLICATE_CHECKING enabled the repeat fails an assertion instead."""

  ec2_client = get_ec2_client()
  response = ec2_client.describe_security_groups()
  assert is_good_response(response)

  groups_by_name = OrderedDict()
  ec2 = get_ec2_resource()
  for group_info in response['SecurityGroups']:
    group_name = get_name(group_info.get('Tags', []))
    if not group_name or group_name == EMPTY_NAME:
      continue  # ignore unnamed security groups
    if group_name in groups_by_name:
      util.log(f"Warning: Duplicate security group {group_name}")
      assert not DUPLICATE_CHECKING, ("Duplicate security group " + group_name)
    groups_by_name[group_name] = ec2.SecurityGroup(group_info['GroupId'])

  return groups_by_name
169 | 
170 | 
def get_keypair_dict():
  """Build a {keypair_name: ec2.KeyPair} mapping of all keypairs.

  A repeated name is logged as a warning; with DUPLICATE_CHECKING enabled it
  fails an assertion instead."""

  ec2_client = get_ec2_client()
  response = ec2_client.describe_key_pairs()
  assert is_good_response(response)

  keypairs_by_name = {}
  ec2 = get_ec2_resource()
  for keypair_info in response['KeyPairs']:
    name = keypair_info.get('KeyName', '')
    if name in keypairs_by_name:
      util.log(f"Warning: Duplicate key {name}")
      assert not DUPLICATE_CHECKING, "Duplicate key " + name
    keypairs_by_name[name] = ec2.KeyPair(name)
  return keypairs_by_name
188 | 
189 | 
def get_prefix():
  """Return the global prefix used to name ncluster-created resources
  (EFS, VPC, keypair prefix, ...).

  Defaults to DEFAULT_PREFIX and can be overridden through $NCLUSTER_PREFIX
  for debugging purposes; an overriding value is checked with
  validate_prefix before being returned."""

  prefix = os.environ.get('NCLUSTER_PREFIX', DEFAULT_PREFIX)
  if prefix == DEFAULT_PREFIX:
    return prefix
  validate_prefix(prefix)
  return prefix
198 | 
199 | 
def get_account_number():
  """Return the AWS account id of the caller as a string.

  Retries forever, sleeping RETRY_INTERVAL_SEC between attempts, and logs
  each failure (plus a hint when AWS_SECRET_ACCESS_KEY is not set)."""
  while True:
    try:
      identity = boto3.client('sts').get_caller_identity()
      return str(identity['Account'])
    except Exception as e:
      util.log(f'Exception in get_account_number {e}, retrying')
      credentials_missing = 'AWS_SECRET_ACCESS_KEY' not in os.environ
      if credentials_missing:
        util.log(
          'AWS_SECRET_ACCESS_KEY not in env vars, configure your AWS credentials."')
      time.sleep(RETRY_INTERVAL_SEC)
210 | 
211 | 
def get_region():
  """Return the region_name of the session from get_session()."""
  session = get_session()
  return session.region_name
214 | 
215 | 
def get_zone() -> str:
  """Return the zone from $NCLUSTER_ZONE, or '' when the variable is unset."""
  zone = os.environ.get('NCLUSTER_ZONE')
  return '' if zone is None else zone
219 | 
220 | 
def get_zones():
  """Return the names of all availability zones reported for the region.

  Asserts that every zone is in the 'available' state and carries no
  messages."""
  ec2_client = get_ec2_client()
  response = ec2_client.describe_availability_zones()
  assert is_good_response(response)

  zone_names = []
  for zone_info in response['AvailabilityZones']:
    messages = zone_info['Messages']
    zone = zone_info['ZoneName']
    state = zone_info['State']
    assert not messages, f"zone {zone} is broken? Has messages {messages}"
    assert state == 'available', f"zone {zone} is broken? Has state {state}"
    zone_names.append(zone)
  return zone_names
234 | 
235 | 
def get_session():
  """Return a new boto3.Session built with default settings."""
  # in future can add aws profile support with Session(profile_name=...)
  session = boto3.Session()
  return session
239 | 
240 | 
241 | ################################################################################
242 | # keypairs
243 | ################################################################################
244 | # For naming conventions, see
245 | # https://docs.google.com/document/d/14-zpee6HMRYtEfQ_H_UN9V92bBQOt0pGuRKcEJsxLEA/edit#heading=h.45ok0839c0a
246 | 
def get_keypair_name():
  """Return the current keypair name, '<prefix>-<username>'.

  Asserts that the username has no '-' (parse_key_name splits on it), passes
  validate_aws_name and is shorter than 30 characters."""

  user = get_username()
  assert '-' not in user, "username must not contain -, change $USER"
  validate_aws_name(user)
  assert len(user) < 30  # to avoid exceeding AWS 127 char limit
  return '%s-%s' % (get_prefix(), user)
255 | 
256 | 
def get_keypair():
  """Look up the current keypair by the name from get_keypair_name().

  https://boto3.readthedocs.io/en/latest/reference/services/ec2.html#keypairinfo

  Raises KeyError if get_keypair_dict() has no keypair under that name.
  """

  keypairs_by_name = get_keypair_dict()
  return keypairs_by_name[get_keypair_name()]
264 | 
265 | 
def get_keypair_fn():
  """Return the path of the .pem file for the current keypair.

  The file name combines keypair name, account number and region, and lives
  under PRIVATE_KEY_LOCATION."""

  return '%s/%s-%s-%s.pem' % (PRIVATE_KEY_LOCATION, get_keypair_name(),
                              get_account_number(), get_region())
274 | 
275 | 
def get_vpc_name():
  """Return the VPC name, which is the current prefix."""
  vpc_name = get_prefix()
  return vpc_name
278 | 
279 | 
def get_security_group_name():
  """Return the security group name, which is the current prefix."""
  # We have two security groups, ncluster for manually created VPC and
  # ncluster-default for default VPC. Once default VPC works for all cases, can
  # get rid of one of security groups
  group_name = get_prefix()
  return group_name
285 | 
286 | 
def get_gateway_name():
  """Return the internet gateway name, which is the current prefix."""
  gateway_name = get_prefix()
  return gateway_name
289 | 
290 | 
def get_route_table_name():
  """Return the route table name, which is the current prefix."""
  route_table_name = get_prefix()
  return route_table_name
293 | 
294 | 
def get_efs_name():
  """Return the EFS name, which is the current prefix."""
  efs_name = get_prefix()
  return efs_name
297 | 
298 | 
def get_username():
  """Return the value of $USER; asserts that the variable is set."""
  env = os.environ
  assert 'USER' in env, "why isn't USER defined?"
  return env['USER']
302 | 
303 | 
def lookup_image(wildcard):
  """Return the single ec2.Image whose name matches wildcard,
  e.g. lookup_image('pytorch*').

  https://boto3.readthedocs.io/en/latest/reference/services/ec2.html#image

  Assert fails if multiple images match or no images match.
  """

  name_filter = {'Name': 'name', 'Values': [wildcard]}

  # Note, can add filtering by Owners as follows
  #  images = list(ec2.images.filter_(Filters = [filter_], Owners=['self', 'amazon']))
  matches = list(get_ec2_resource().images.filter(Filters=[name_filter]))

  assert len(matches) <= 1, "Multiple images match " + str(wildcard)
  assert len(matches) > 0, "No images match " + str(wildcard)
  return matches[0]
324 | 
325 | 
def lookup_instance(name: str, instance_type: str = '', image_name: str = '',
                    states: tuple = ('running', 'stopped', 'initializing')):
  """Look up the AWS instance with the given instance name, like
  simple.worker.

  Only instances in one of `states` are considered. Instances whose key name
  does not parse to the current prefix and username are ignored (with a
  printed note).

  Args:
    name: instance name to look for.
    instance_type: if non-empty, asserts the found instance has this type.
    image_name: if non-empty, asserts the found instance uses this image.
    states: instance-state-name values to include in the search.

  Returns:
    The matching instance, or None if there is no match.

  Raises:
    AssertionError: if more than one instance matches, or a match has a
      different instance type / image than requested.
  """

  ec2 = get_ec2_resource()

  instances = ec2.instances.filter(
    Filters=[{'Name': 'instance-state-name', 'Values': states}])

  prefix = get_prefix()
  username = get_username()

  # look for an existing instance matching job, ignore instances launched
  # by different user or under different resource name
  result = []
  for i in instances.all():
    instance_name = get_name(i)
    if instance_name != name:
      continue

    # key_name may be unset for instances launched without a keypair; treat
    # those as "launched by someone else" instead of crashing in split()
    seen_prefix, seen_username = parse_key_name(i.key_name or '')
    if prefix != seen_prefix:
      print(f"Found {name} launched under {seen_prefix}, ignoring")
      continue
    if username != seen_username:
      print(f"Found {name} launched by {seen_username}, ignoring")
      continue

    if instance_type:
      assert i.instance_type == instance_type, f"Found existing instance for job {name} but different instance type ({i.instance_type}) than requested ({instance_type}), terminate {name} first or use new task name."

    if image_name:
      assert i.image.name == image_name, f"Found existing instance for job {name} but launched with different image ({i.image.name}) than requested ({image_name}), terminate {name} first or use new task name."
    result.append(i)

  # Checked after the whole scan: inside the loop the function returned on
  # the first match, so duplicates could never be detected.
  assert len(result) < 2, f"Found two instances with name {name}"
  if not result:
    return None
  return result[0]
367 | 
368 | 
def ssh_to_task(task) -> paramiko.SSHClient:
  """Create an ssh connection to the task's machine.

  Keeps retrying every RETRY_INTERVAL_SEC until the connection succeeds.
  Every 11th attempt the hostname is re-read from task.public_ip, since the
  machine could have been restarted in the meantime.

  Args:
    task: object providing ssh_username, public_ip and name attributes.

  Returns:
    Paramiko SSH client connected to the host.
  """

  username = task.ssh_username
  hostname = task.public_ip
  ssh_key_fn = get_keypair_fn()
  print(f"ssh -i {ssh_key_fn} {username}@{hostname}")
  pkey = paramiko.RSAKey.from_private_key_file(ssh_key_fn)

  ssh_client = paramiko.SSHClient()
  ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

  counter = 1
  while True:
    try:
      ssh_client.connect(hostname=hostname, username=username, pkey=pkey)
      break
    except Exception as e:
      print(
        f'{task.name}: Exception connecting to {hostname} via ssh (could be a timeout): {e}')
      # The attempt counter was never advanced and the refresh sat after a
      # successful connect, so a changed public ip was never picked up.
      counter += 1
      if counter % 11 == 0:  # occasionally re-obtain public ip, machine could've gotten restarted
        hostname = task.public_ip
      time.sleep(RETRY_INTERVAL_SEC)

  return ssh_client
399 | 
400 | 
def parse_key_name(keyname):
  """Split keyname into its two '-'-separated parts: resource, username.

  Returns the two-element list from str.split when keyname contains exactly
  one '-', otherwise the tuple (None, None)."""
  # Relies on resource name not containing -, validated in
  # validate_resource_name
  parts = keyname.split('-')
  if len(parts) == 2:
    return parts
  return None, None  # some other keyname not launched by nexus
410 | 
411 | 
# Letters, digits and the literal characters + - = . _ : / @
# The '-' is escaped: an unescaped '+-=' is a character range that also lets
# through ',', ';' and '<'. \Z (rather than $) rejects a trailing newline.
aws_name_regexp = re.compile(r'^[a-zA-Z0-9+\-=._:/@]*\Z')


def validate_aws_name(name):
  """Validate resource name using AWS name restrictions from
  http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Using_Tags.html#tag-restrictions

  Raises:
    AssertionError: if name is longer than 127 characters or contains a
      character outside letters, digits and + - = . _ : / @
    UnicodeEncodeError: if name contains non-ASCII characters.
  """
  assert len(name) <= 127
  # disallow unicode characters to avoid pain
  assert name == name.encode('ascii').decode('ascii')
  assert aws_name_regexp.match(name)
421 | 
422 | 
resource_regexp = re.compile('^[a-z0-9]+$')


def validate_prefix(name):
  """Check that name can stand in for the default prefix.

  The prefix ends up in unix filenames and key names, so the rules are
  stricter than AWS's own: at most 30 characters, lowercase letters and
  digits only. The name must also pass validate_aws_name."""
  too_long = len(name) > 30
  assert not too_long
  assert resource_regexp.match(name)
  validate_aws_name(name)
431 | 
432 | 
def validate_run_name(name):
  """Validate a run name (used as part of instance name, tmux session name).

  Asserts the name is at most 30 characters, then applies
  validate_aws_name."""
  too_long = len(name) > 30
  assert not too_long
  validate_aws_name(name)
437 | 
438 | 
def create_name_tags(name):
  """Build the tag list that assigns `name` as the 'Name' tag:
  [{'Key': 'Name', 'Value': name}] """
  name_tag = {'Key': 'Name', 'Value': name}
  return [name_tag]
442 | 
443 | 
def create_efs(name):
  """Create a new EFS file system tagged with `name` and return its id.

  The file system is created with a fresh creation token. The create call is
  then repeated with the same token until it is rejected with
  FileSystemAlreadyExists, which is taken as the sign that the first request
  has been registered. The file system is then located by its token, tagged
  with `name`, and looked up again through get_efs_dict().

  Args:
    name: value for the file system's Name tag.

  Returns:
    The id of the new file system, as listed by get_efs_dict().

  Raises:
    AssertionError: on a bad response, on timeout, or if the new file system
      does not show up under `name`.
  """
  efs_client = get_efs_client()
  token = str(int(time.time() * 1e6))  # epoch usec

  response = efs_client.create_file_system(CreationToken=token,
                                           PerformanceMode='generalPurpose')
  assert is_good_response(response)
  start_time = time.time()
  while True:
    try:
      response = efs_client.create_file_system(CreationToken=token,
                                               PerformanceMode='generalPurpose')
      assert is_good_response(response)
    except Exception as e:
      # The error code lives on the exception (botocore ClientError carries a
      # `response` dict), not on the last successful `response`; indexing
      # response['Error'] here raised KeyError.
      error_code = getattr(e, 'response', {}).get('Error', {}).get('Code', '')
      already_exists = (error_code == 'FileSystemAlreadyExists' or
                        'FileSystemAlreadyExists' in str(e))
      if not already_exists:
        util.log_error(str(e))
      break

    if time.time() - start_time > RETRY_TIMEOUT_SEC:
      assert False, "Timeout exceeded creating EFS %s (%s)" % (token, name)

    # poll at the retry interval; sleeping RETRY_TIMEOUT_SEC here used up the
    # whole timeout budget in a single iteration
    time.sleep(RETRY_INTERVAL_SEC)

  # find efs id from given token
  response = efs_client.describe_file_systems()
  assert is_good_response(response)
  fs_id = extract_attr_for_match(response['FileSystems'], FileSystemId=-1,
                                 CreationToken=token)
  response = efs_client.create_tags(FileSystemId=fs_id,
                                    Tags=create_name_tags(name))
  assert is_good_response(response)

  # make sure EFS is now visible
  efs_dict = get_efs_dict()
  assert name in efs_dict
  return efs_dict[name]
486 | 
487 | 
def delete_efs_by_id(efs_id):
  """Delete the file system efs_id, retrying because deletion sometimes fails.

  Attempts are repeated every RETRY_INTERVAL_SEC until one succeeds or
  roughly RETRY_TIMEOUT_SEC has elapsed, at which point "Giving up" is
  printed. Progress is printed to stdout; nothing is returned or raised.
  """
  start_time = time.time()
  efs_client = get_efs_client()
  sys.stdout.write("deleting %s ... " % (efs_id,))
  while True:
    try:
      response = efs_client.delete_file_system(FileSystemId=efs_id)
      if is_good_response(response):
        print("succeeded")
        break
    except Exception as e:
      print("Failed with %s" % (e,))

    # One shared timeout check for both failure modes: a bad response that
    # raised no exception previously retried without any time limit.
    if time.time() - start_time - RETRY_INTERVAL_SEC >= RETRY_TIMEOUT_SEC:
      print("Giving up")
      break
    print("Retrying in %s sec" % (RETRY_INTERVAL_SEC,))
    time.sleep(RETRY_INTERVAL_SEC)
508 | 
509 | 
def extract_attr_for_match(items, **kwargs):
  """Return one attribute of the single item that matches the given criteria.

  Criteria are passed as keyword arguments; the attribute to return is the
  one keyword whose value is -1, every other keyword must equal the item's
  value for that key.

  Example:
    to extract state of vpc matching given vpc id

  response = [{'State': 'available', 'VpcId': 'vpc-2bb1584c'}]
  extract_attr_for_match(response, State=-1, VpcId='vpc-2bb1584c') #=> 'available'

  Returns None when nothing matches. Asserts that at most one keyword is -1,
  that every item has all the keyword keys, and that at most one item
  matches."""

  # find the value of attribute to return
  query_arg = None
  for key in kwargs:
    if kwargs[key] == -1:
      assert query_arg is None, "Only single query arg (-1 valued) is allowed"
      query_arg = key

  required_keys = set(kwargs.keys())
  matches = []
  for record in items:
    assert required_keys.issubset(
      record.keys()), "Filter set contained %s which was not in record %s" % (
      required_keys.difference(record.keys()),
      record)
    mismatch = any(record[field] != kwargs[field]
                   for field in record
                   if field != query_arg and field in kwargs)
    if not mismatch:
      matches.append(record[query_arg])

  assert len(matches) <= 1, "%d values matched %s, only allow 1" % (
    len(matches), kwargs)
  return matches[0] if matches else None
549 | 
550 | 
def get_tags(instance):
  """Return the `tags` property of `instance`, retrying until it is not None."""
  tags = get_instance_property(instance, 'tags')
  return tags
555 | 
556 | 
def get_public_ip(instance):
  """Return `public_ip_address` of `instance`, retrying until it is not None."""
  public_ip = get_instance_property(instance, 'public_ip_address')
  return public_ip
559 | 
560 | 
def get_ip(instance):
  """Return `private_ip_address` of `instance`, retrying until it is not None."""
  private_ip = get_instance_property(instance, 'private_ip_address')
  return private_ip
563 | 
564 | 
def get_instance_property(instance, property_name):
  """Read attribute `property_name` from `instance`, retrying until non-None.

  After every unsuccessful attempt (None value or exception) the function
  sleeps RETRY_INTERVAL_SEC and calls instance.reload() before trying again.
  There is no upper bound on the number of attempts.
  """

  name = get_name(instance)
  value = None
  while value is None:
    try:
      value = getattr(instance, property_name)
      if value is None:
        print(f"retrieving {property_name} on {name} produced None, retrying")
        time.sleep(RETRY_INTERVAL_SEC)
        instance.reload()
    except Exception as err:
      print(f"retrieving {property_name} on {name} failed with {err}, retrying")
      time.sleep(RETRY_INTERVAL_SEC)
      # best-effort refresh; a failing reload just leads to another attempt
      try:
        instance.reload()
      except Exception:
        pass

  return value
588 | 
589 | 
def call_with_retries(method, debug_string='',
                      retry_interval_sec=RETRY_INTERVAL_SEC,
                      **kwargs):
  """Call `method(**kwargs)` until it succeeds with a non-None result.

  Args:
    method: callable to invoke
    debug_string: label used in the retry message
    retry_interval_sec: pause between attempts, in seconds
    **kwargs: forwarded to `method`

  Returns:
    the first non-None value returned by `method`. Exceptions and None
    results are printed and retried without limit.
  """
  while True:
    try:
      result = method(**kwargs)
      # a None result is treated as a failure and handled by the except below
      assert result is not None, f"{debug_string} was None"
    except Exception as err:
      print(f"{debug_string} failed with {err.__class__}({err}), retrying")
      time.sleep(retry_interval_sec)
    else:
      return result
604 | 
605 | 
def get_ec2_resource():
  """Return a boto3 EC2 service resource built from the current session.

  If the resource cannot be created, prints the error together with a hint
  about setting AWS_DEFAULT_REGION and terminates the process with a
  non-zero exit status.
  """
  try:
    resource = get_session().resource('ec2')
  except Exception as e:
    print(f"Failed with error '{e}'")
    print("To specify Virginia region, do 'export AWS_DEFAULT_REGION=us-east-1'")
    # exit status 1: a bare sys.exit() would report success to the shell
    sys.exit(1)
  return resource
614 | 
615 | 
def get_ec2_client():
  """Return a boto3 EC2 low-level client built from the current session.

  If the client cannot be created, prints the error together with a hint
  about setting AWS_DEFAULT_REGION and terminates the process with a
  non-zero exit status.
  """
  try:
    client = get_session().client('ec2')
  except Exception as e:
    print(f"Failed with error '{e}'")
    print("To specify Virginia region, do 'export AWS_DEFAULT_REGION=us-east-1'")
    # exit status 1: a bare sys.exit() would report success to the shell
    sys.exit(1)
  return client
624 | 
625 | 
def get_efs_client():
  """Return a boto3 EFS client, retrying every 2 seconds until creation works."""
  while True:
    try:
      efs = get_session().client('efs')
    except Exception as err:
      # creation has been seen to fail transiently with
      # botocore.exceptions.DataNotFoundError: Unable to load data for: endpoints
      util.log(f"get_session().client('efs') failed with {err}, retrying")
      time.sleep(2)
    else:
      return efs
635 | 
636 | 
def is_good_response(response):
  """Tell whether a boto3 response dict carries a 2xx HTTP status code."""

  status = response["ResponseMetadata"]["HTTPStatusCode"]
  # any 2xx counts as success, e.g. EFS creation answers with 201
  return not (status < 200 or status >= 300)
643 | 
644 | 
def get_name(tags_or_instance_or_id):
  """Helper utility to extract the name out of a tags list, instance or id.
      [{'Key': 'Name', 'Value': 'nexus'}] -> 'nexus'

  Args:
    tags_or_instance_or_id: one of
      - an object with a `tags` attribute (its tags are used),
      - a string, treated as an EC2 instance id whose tags are looked up,
      - None,
      - an iterable of {'Key': ..., 'Value': ...} tag dicts.

  Returns:
    the value of the 'Name' tag; EMPTY_NAME when the argument is None or
    there are no tags at all; '' when tags exist but none is keyed 'Name'.

  Raises:
    AssertionError: if there is more than one 'Name' tag, or the argument is
      none of the supported kinds.
  """

  if hasattr(tags_or_instance_or_id, 'tags'):
    tags = tags_or_instance_or_id.tags
  elif isinstance(tags_or_instance_or_id, str):
    # only an instance id needs an EC2 lookup; creating the resource lazily
    # keeps the tags/instance/None paths free of AWS calls
    tags = get_ec2_resource().Instance(tags_or_instance_or_id).tags
  elif tags_or_instance_or_id is None:
    return EMPTY_NAME
  else:
    assert isinstance(tags_or_instance_or_id,
                      Iterable), "expected iterable of tags"
    tags = tags_or_instance_or_id

  if not tags:
    return EMPTY_NAME
  names = [entry['Value'] for entry in tags if entry['Key'] == 'Name']
  if not names:
    return ''
  assert len(names) <= 1, "have more than one name: " + str(names)
  return names[0]
673 | 
674 | 
def wait_until_available(resource):
  """Block until `resource.state` equals 'available'.

  The resource is refreshed with resource.load() before every check, with a
  pause of RETRY_INTERVAL_SEC between checks. There is no timeout.
  """
  resource.load()
  while resource.state != 'available':
    time.sleep(RETRY_INTERVAL_SEC)
    resource.load()
682 | 
683 | 
def maybe_create_placement_group(name='', max_retries=10):
  """Make sure a 'cluster' placement group called `name` exists and is usable.

  Does nothing when `name` is empty. Otherwise the group is reused if
  describe_placement_groups finds it, or created if not. Afterwards the
  function polls until the group reports state 'available'.

  Args:
    name: placement group name; empty means "ignore this request"
    max_retries: number of polls for the 'available' state before failing

  Raises:
    AssertionError: if the group is not available after `max_retries` polls.
  """

  if not name:
    return

  ec2_client = get_ec2_client()

  # Phase 1: find the group, creating it when the lookup fails.
  # NOTE(review): this loop has no retry bound or sleep; if both the lookup
  # and the creation keep failing it appears to spin forever - confirm.
  found = False
  while not found:
    try:
      ec2_client.describe_placement_groups(GroupNames=[name])
      print("Reusing placement_group group: " + name)
      found = True  # no Exception means group name was found
    except Exception:
      print("Creating placement_group group: " + name)
      try:
        ec2_client.create_placement_group(GroupName=name, Strategy='cluster')
      except Exception:
        # because of race can get InvalidPlacementGroup.Duplicate
        pass

  # Phase 2: wait for the group to become available.
  attempts = 0
  while True:
    try:
      described = ec2_client.describe_placement_groups(GroupNames=[name])
      group = described['PlacementGroups'][0]
      if group['State'] == 'available':
        # a failing assert lands in the except below and counts as a retry
        assert group['Strategy'] == 'cluster'
        return
    except Exception as err:
      print("Got exception: %s" % (err,))
    attempts += 1
    assert attempts < max_retries, f'Failed to create placement_group group {name} in {max_retries} attempts'
    time.sleep(RETRY_INTERVAL_SEC)
720 | 
721 | 
722 | # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html#concepts-placement-groups
def instance_supports_placement_groups(instance_type: str):
  """Match `instance_type` against the known placement-group capable families.

  The comparison is a case-insensitive prefix match.

  Returns:
    the regex match object (truthy) when the type is supported, else None.
  """
  supported_families = "^(m4|m5|m5d|c3|c4|c5|c5d|cc2.8xlarge|cr1.8xlarge|r3|r4|r5|r5d|x1|x1e|z1d|d2|h1|hs1.8xlarge|i2|i3|i3.metal|f1|g2|g3|p2|p3).*$"
  return re.match(supported_families, instance_type, re.IGNORECASE)
728 | 
729 | 
def lookup_instances(fragment, verbose=True, filter_by_key=True):
  """Find running instances whose name, id, public ip or private ip contains
  `fragment`, newest launch first.

  args:
    fragment: substring to look for
    verbose: print information about the lookup and skipped instances
    filter_by_key: if True, drop instances whose key_name differs from the
        one returned by get_keypair_name()

  returns:
    list of ec2.Instance objects, most recently launched first
  """

  def vprint(*args):
    if verbose:
      print(*args)

  region = get_region()
  client = get_ec2_client()
  ec2 = get_ec2_resource()
  assert is_good_response(client.describe_instances())

  # collect (launch timestamp, instance) pairs for every running match
  stamped = []
  for candidate in ec2.instances.all():
    if candidate.state['Name'] != 'running':
      continue
    name = get_name(candidate)
    haystacks = (name,
                 str(candidate.public_ip_address),
                 str(candidate.id),
                 str(candidate.private_ip_address))
    if any(fragment in haystack for haystack in haystacks):
      stamped.append((util.toseconds(candidate.launch_time), candidate))

  stamped.sort(key=itemgetter(0))
  vprint("Using region ", region)

  selected = []  # newest first, optionally restricted to the user's key
  for _launched, candidate in reversed(stamped):
    if not filter_by_key or candidate.key_name == get_keypair_name():
      selected.append(candidate)
    else:
      vprint(f"Got key {candidate.key_name}, expected {get_keypair_name()}")
  return selected
771 | 
def create_spot_instances(launch_specs, spot_price=26, expiration_mins=15):
  """Request spot instances and wait until the requests are fulfilled.

  args:
    launch_specs: run_instances-style launch arguments. 'MinCount' (used as
      the instance count, default 1), 'MaxCount' and 'TagSpecifications' are
      stripped before the dict is passed on as the spot LaunchSpecification;
      tags from the first TagSpecifications entry are applied to the
      instances afterwards. The caller's dict is not modified.
    spot_price: default is $26 which is right above p3.16xlarge on demand price
    expiration_mins: this request only valid for this many mins from now

  returns:
    list of ec2.Instance objects, one per fulfilled request

  raises:
    AssertionError: if the spot request itself is rejected
    RuntimeError: if any request is not fulfilled; instances that were
      fulfilled are terminated first
  """
  import datetime as dt

  ec2c = get_ec2_client()

  # work on a shallow copy so the caller's dict keeps its keys
  launch_specs = dict(launch_specs)
  num_tasks = launch_specs.pop('MinCount', None) or 1
  launch_specs.pop('MaxCount', None)

  tags = None
  tag_specs = launch_specs.pop('TagSpecifications', None)
  if tag_specs:
    try:
      tags = tag_specs[0]['Tags']
    except (IndexError, KeyError, TypeError):
      tags = None  # malformed tag specification: launch untagged

  # timezone-aware "now" so that ValidUntil is an unambiguous UTC timestamp
  now = dt.datetime.now(dt.timezone.utc)

  spot_args = {
    'LaunchSpecification': launch_specs,
    'SpotPrice': str(spot_price),
    'InstanceCount': num_tasks,
    'ValidUntil': now + dt.timedelta(minutes=expiration_mins),
  }

  try:
    response = ec2c.request_spot_instances(**spot_args)
  except Exception as e:
    # raised explicitly (not via `assert`) so it also fires under python -O
    raise AssertionError(
      f"Spot instance request failed (out of capacity?), error was {e}") from e

  spot_requests = response['SpotInstanceRequests']
  instance_ids = wait_on_fulfillment(ec2c, spot_requests)

  print('Instances fullfilled...')
  ec2 = get_ec2_resource()
  fulfilled_ids = [instance_id for instance_id in instance_ids if instance_id]
  if fulfilled_ids:
    instances = list(ec2.instances.filter(
      Filters=[{'Name': 'instance-id', 'Values': fulfilled_ids}]))
  else:
    # never send an empty instance-id filter to EC2
    instances = []

  if not all(instance_ids):
    # partial fulfillment: release what was obtained before failing
    for instance in instances:
      instance.terminate()
    raise RuntimeError('Failed to create spot instances:', instance_ids)

  if tags:
    for instance in instances:
      instance.create_tags(Tags=tags)

  return instances
820 | 
821 | 
def wait_on_fulfillment(ec2c, reqs):
  """Wait for each spot request in `reqs` to become active.

  args:
    ec2c: EC2 client used to poll and, on failure, cancel requests
    reqs: spot instance request dicts as returned by request_spot_instances

  returns:
    list with one entry per request: the InstanceId once the request is
    active, or None if the request failed and was cancelled
  """
  pending_codes = ['pending-evaluation', 'pending-fulfillment', 'fulfilled']

  def get_instance_id(req):
    while True:
      if req['State'] == 'active':
        break
      print('Waiting on spot fullfillment...')
      time.sleep(5)
      request_id = req['SpotInstanceRequestId']
      described = ec2c.describe_spot_instance_requests(Filters=[{'Name': 'spot-instance-request-id', 'Values': [request_id]}])
      found = described['SpotInstanceRequests']
      if not found:
        print(f"SpotInstanceRequest for {request_id} not found")
        continue
      req = found[0]
      req_status = req['Status']
      if req_status['Code'] in pending_codes:
        continue
      # any other status code means the request will not be fulfilled
      print('Spot instance request failed:', req_status['Message'])
      print('Cancelling request. Please try again or use on demand.')
      ec2c.cancel_spot_instance_requests(SpotInstanceRequestIds=[req['SpotInstanceRequestId']])
      print(req)
      return None
    instance_id = req['InstanceId']
    print('Fulfillment completed. InstanceId:', instance_id)
    return instance_id

  return list(map(get_instance_id, reqs))
843 | 


--------------------------------------------------------------------------------
/ncluster/backend.py:
--------------------------------------------------------------------------------
  1 | """Interface for job launching backend.
  2 | 
  3 | Run/Job and Task are container classes encapsulating functionality.
  4 | User creates them through make_run/make_job/make_task methods
  5 | 
  6 | """
  7 | # Job launcher Python API: https://docs.google.com/document/d/1yTkb4IPJXOUaEWksQPCH7q0sjqHgBf3f70cWzfoFboc/edit
  8 | # AWS job launcher (concepts): https://docs.google.com/document/d/1IbVn8_ckfVO3Z9gIiE0b9K3UrBRRiO9HYZvXSkPXGuw/edit
  9 | import threading
 10 | import time
 11 | from typing import List, Tuple, Any
 12 | 
 13 | from . import util
 14 | 
 15 | # aws_backend.py
 16 | # local_backend.py
 17 | 
# Root directory under which this backend keeps run logdirs. Left as None in
# this interface module.
LOGDIR_ROOT: str = None  # location of logdir for this backend
 19 | 
 20 | """
 21 | backend = aws_backend # alternatively, backend=tmux_backend to launch jobs locally in separate tmux sessions
 22 | run = backend.make_run("helloworld")  # sets up /efs/runs/helloworld
 23 | worker_job = run.make_job("worker", instance_type="g3.4xlarge", num_tasks=4, ami=ami, setup_script=setup_script)
 24 | ps_job = run.make_job("ps", instance_type="c5.xlarge", num_tasks=4, ami=ami, setup_script=setup_script)
 25 | setup_tf_config(worker_job, ps_job)
 26 | ps_job.run("python cifar10_main.py --num_gpus=0")  # runs command on each task
 27 | worker_job.run("python cifar10_main.py --num_gpus=4")
 28 | 
 29 | tb_job = run.make_job("tb", instance_type="m4.xlarge", num_tasks=1, public_port=6006)
 30 | tb_job.run("tensorboard --logdir=%s --port=%d" %(run.logdir, 6006))
 31 | # when job has one task, job.task[0].ip can be accessed as job.ip
 32 | print("See TensorBoard progress on %s:%d" %(tb_job.ip, 6006))
 33 | print("To interact with workers: %s" %(worker_job.connect_instructions))
 34 | 
 35 | 
 36 | To reconnect to existing job:
 37 | 
 38 | """
 39 | 
 40 | 
 41 | class Task:
 42 |   name: str
 43 |   ip: str
 44 |   public_ip: str
 45 |   run_counter: int
 46 |   # location where temporary files from interfacing with task go locally
 47 |   local_scratch: str
 48 |   # location where temporary files from interfacing with task go on task
 49 |   remote_scratch: str
 50 |   job: Any  # can't declare Job because of circular dependency
 51 | 
 52 |   def __init__(self, name=''):
 53 |     """Wraps execution resources into a task. Runs install_script if present"""
 54 |     self.last_status = None
 55 |     self.name = name
 56 |     self.instance = None
 57 |     self.install_script = None
 58 |     self.job = None
 59 |     self.kwargs = None
 60 |     self.public_ip = None
 61 |     self.ip = None
 62 |     self.logdir_ = None
 63 | 
 64 |   @property
 65 |   def logdir(self):
 66 |     raise NotImplementedError()
 67 | 
 68 |   def run(self, cmd: str, non_blocking=False, ignore_errors=False):
 69 |     """Runs command on given task."""
 70 |     raise NotImplementedError()
 71 | 
 72 |   def run_with_output(self, cmd, non_blocking=False, ignore_errors=False) -> \
 73 |           Tuple[str, str]:
 74 |     """
 75 | 
 76 |     Args:
 77 |       cmd: single line shell command to run
 78 |       non_blocking (bool): if True, does not wait for command to finish
 79 |       ignore_errors: if True, will succeed even if command failed
 80 | 
 81 |     Returns:
 82 |       Contents of stdout/stderr as strings.
 83 |     Raises
 84 |       RuntimeException: if command produced non-0 returncode
 85 | 
 86 |     """
 87 | 
 88 |     assert '\n' not in cmd, "Do not support multi-line commands"
 89 |     cmd: str = cmd.strip()
 90 |     if not cmd or cmd.startswith('#'):  # ignore empty/commented out lines
 91 |       return '', ''
 92 | 
 93 |     stdout_fn = f"{self.remote_scratch}/{self.run_counter+1}.stdout"
 94 |     stderr_fn = f"{self.remote_scratch}/{self.run_counter+1}.stderr"
 95 |     cmd2 = f"{cmd} > {stdout_fn} 2> {stderr_fn}"
 96 | 
 97 |     assert not non_blocking, "Getting output doesn't work with non_blocking"
 98 |     status = self.run(cmd2, False, ignore_errors=True)
 99 |     stdout = self.read(stdout_fn)
100 |     stderr = self.read(stderr_fn)
101 | 
102 |     if self.last_status > 0:
103 |       self.log(f"Warning: command '{cmd}' returned {status},"
104 |                f" stdout was '{stdout}' stderr was '{stderr}'")
105 |       if not ignore_errors:
106 |         raise RuntimeError(f"Warning: command '{cmd}' returned {status},"
107 |                            f" stdout was '{stdout}' stderr was '{stderr}'")
108 | 
109 |     return stdout, stderr
110 | 
111 |   def wait_for_file(self, fn: str, max_wait_sec: int = 3600 * 24 * 365,
112 |                     check_interval: float = 0.02) -> bool:
113 |     """
114 |     Waits for file maximum of max_wait_sec. Returns True if file was detected within specified max_wait_sec
115 |     Args:
116 |       fn: filename on task machine
117 |       max_wait_sec: how long to wait in seconds
118 |       check_interval: how often to check in seconds
119 |     Returns:
120 |       False if waiting was was cut short by max_wait_sec limit, True otherwise
121 |     """
122 |     print("Waiting for file", fn)
123 |     start_time = time.time()
124 |     while True:
125 |       if time.time() - start_time > max_wait_sec:
126 |         util.log(f"Timeout exceeded ({max_wait_sec} sec) for {fn}")
127 |         return False
128 |       if not self.exists(fn):
129 |         time.sleep(check_interval)
130 |         continue
131 |       else:
132 |         break
133 |     return True
134 | 
135 |   def _run_raw(self, cmd):
136 |     """Runs command directly on every task in the job, skipping tmux interface. Use if want to create/manage additional tmux sessions manually."""
137 |     raise NotImplementedError()
138 | 
139 |   def upload(self, local_fn: str, remote_fn: str = '',
140 |              dont_overwrite: bool = False):
141 |     """Uploads given file to the task. If remote_fn is not specified, dumps it
142 |     into task current directory with the same name.
143 | 
144 |     Args:
145 |       local_fn: location of file locally
146 |       remote_fn: location of file on task
147 |       dont_overwrite: if True, will be no-op if target file exists
148 |       """
149 |     raise NotImplementedError()
150 | 
151 |   def download(self, remote_fn: str, local_fn: str = ''):
152 |     """Downloads remote file to current directory."""
153 |     raise NotImplementedError()
154 | 
155 |   def write(self, fn, contents):
156 |     """Write string contents to file fn in task."""
157 |     raise NotImplementedError()
158 | 
159 |   def read(self, fn):
160 |     """Read contents of file and return it as string."""
161 |     raise NotImplementedError()
162 | 
163 |   def exists(self, fn) -> bool:
164 |     """Checks if fn exists on task
165 | 
166 |     Args:
167 |       fn: filename local to task
168 |     Returns:
169 |       true if fn exists on task machine
170 |     """
171 |     raise NotImplementedError()
172 | 
173 |   def log(self, message, *args):
174 |     """Log to launcher console."""
175 |     if args:
176 |       message %= args
177 | 
178 |     print(f"{util.current_timestamp()} {self.name}: {message}")
179 | 
180 | 
class Job:
  """A named group of tasks; operations are fanned out to every task in parallel."""
  name: str
  tasks: List[Task]

  #  run_: Run

  def __init__(self, name: str, tasks: List[Task] = None, **kwargs):
    """Initializes Job object, links tasks to refer back to the Job."""
    if tasks is None:
      tasks = []
    self.name = name
    self.tasks = tasks
    self.kwargs = kwargs
    # TODO: maybe backlinking is not needed
    for task in tasks:
      task.job = self

  @property
  def logdir(self):
    """Logdir of the first task in the job."""
    return self.tasks[0].logdir

  def _non_blocking_wrapper(self, method, *args, **kwargs):
    """Runs given method on every task in the job, one thread per task.
    Blocks until all tasks finish.

    Args:
      method: name of the Task method to call
      *args, **kwargs: forwarded unchanged to every task's method

    Returns:
      list of the per-task return values, in the order of self.tasks
    Raises:
      the first exception recorded by any task, after all threads finished
    """

    # one slot per task so that results come back in task order
    results = [None] * len(self.tasks)
    exceptions = []

    def task_run(index, task):
      try:
        results[index] = getattr(task, method)(*args, **kwargs)
      except Exception as e:
        exceptions.append(e)

    threads = [threading.Thread(name=f'task_{method}_{i}',
                                target=task_run, args=[i, t])
               for i, t in enumerate(self.tasks)]
    for thread in threads:
      thread.start()
    for thread in threads:
      thread.join()
    if exceptions:
      raise exceptions[0]
    return results

  def run(self, *args, **kwargs):
    """Runs command on every task in the job in parallel, blocks until all tasks finish.
    Returns the list of per-task results. See Task for documentation of args/kwargs."""
    return self._non_blocking_wrapper("run", *args, **kwargs)

  def run_with_output(self, *args, **kwargs):
    """Runs command on every task in the job in parallel, blocks until all tasks finish.
    Returns the list of per-task (stdout, stderr) results. See Task for
    documentation of args/kwargs."""
    return self._non_blocking_wrapper("run_with_output", *args, **kwargs)

  def upload(self, *args, **kwargs):
    """See :py:func:`backend.Task.upload`"""
    return self._non_blocking_wrapper("upload", *args, **kwargs)

  def write(self, *args, **kwargs):
    """Calls Task.write on every task in parallel."""
    return self._non_blocking_wrapper("write", *args, **kwargs)

  def _run_raw(self, *args, **kwargs):
    """Calls Task._run_raw on every task in parallel."""
    return self._non_blocking_wrapper("_run_raw", *args, **kwargs)
244 | 
# Implementation needs to be backend-specific so that run.make_job calls the backend-specific method
class Run:
  """Run is a collection of jobs that share state. IE, training run will contain gradient worker job, parameter
  server job, and TensorBoard visualizer job. These jobs will use the same shared directory to store checkpoints and
  event files.

  This class only declares the interface; every member raises
  NotImplementedError and is expected to be provided by a backend.
  :ivar aws_placement_group_name: somedoc
  """
  jobs: List[Job]

  @property
  def logdir(self):
    """Logging directory of the run."""
    raise NotImplementedError

  # TODO: currently this is synchronous, use non_blocking wrapper like in Job to parallelize methods
  def run(self, *args, **kwargs):
    """Backend-specific counterpart of Job.run for the whole run."""
    raise NotImplementedError

  def run_with_output(self, *args, **kwargs):
    """Backend-specific counterpart of Job.run_with_output for the whole run."""
    raise NotImplementedError

  def _run_raw(self, *args, **kwargs):
    """Backend-specific counterpart of Job._run_raw for the whole run."""
    raise NotImplementedError

  def upload(self, *args, **kwargs):
    """Backend-specific counterpart of Job.upload for the whole run."""
    raise NotImplementedError

  def make_job(self, name='', **kwargs):
    """Create a job belonging to this run; implemented by the backend."""
    raise NotImplementedError
274 | 
def make_task(**_kwargs):
  """Interface placeholder for task creation; always raises NotImplementedError."""
  raise NotImplementedError
277 | 
278 | 
def make_job(**_kwargs):
  """Interface placeholder for job creation; always raises NotImplementedError."""
  raise NotImplementedError
281 | 
282 | 
def make_run(**_kwargs):
  """Interface placeholder for run creation; always raises NotImplementedError."""
  raise NotImplementedError
285 | 


--------------------------------------------------------------------------------
/ncluster/local_backend.py:
--------------------------------------------------------------------------------
  1 | """Local implementation of backend.py using separate tmux sessions for jobs.
  2 | 
  3 | Not thread-safe.
  4 | """
  5 | 
  6 | import glob
  7 | import os
  8 | import shlex
  9 | import socket
 10 | import time
 11 | from typing import List
 12 | 
 13 | from ncluster import ncluster_globals
 14 | from . import backend
 15 | from . import util
 16 | 
# Parent of the per-task working directories ("<name>-<launch_id>").
TASKDIR_ROOT = '/tmp/ncluster/task'
# Parent of the per-task scratch directories holding .cmd/.status/.out files.
SCRATCH_ROOT = '/tmp/ncluster/scratch'
# Root for run logdirs. Reading HOME at import time raises KeyError when the
# variable is not set.
LOGDIR_ROOT = os.environ[
                'HOME'] + '/ncluster/runs'  # use ~ instead of /tmp because /tmp gets wiped
 21 | 
 22 | 
 23 | # todo: tmux session names are backwards from AWS job names (runname-jobname)
 24 | # TODO: add kwargs so that tmux backend can be drop-in replacement
 25 | 
 26 | 
 27 | # TODO: rename extra_kwargs to kwargs everywhere
 28 | class Task(backend.Task):
 29 |   """Local tasks interact with tmux session where session name is derived
 30 |   from job name, and window names are task ids."""
 31 |   tmux_window_id: int
 32 |   tmux_available_window_ids: List[int]
 33 | 
  def __init__(self, name, *, tmux_session, install_script='', job=None,
               **kwargs):
    """Set up a local task bound to an existing tmux session.

    Creates the task directory and scratch directory through _run_raw,
    changes the tmux window into the task directory, then runs
    `install_script` one line at a time through run().

    Args:
      name: task name, also used to build the task/scratch directory names
      tmux_session: name of the tmux session commands are sent to
      install_script: newline-separated shell commands to run after setup
      job: job this task belongs to, if any
      **kwargs: stored on the task as self.kwargs
    """

    # NOTE(review): backend.Task.__init__ is not called here, so attributes it
    # would set (e.g. last_status) do not exist until run() assigns them.
    self.homedir = os.environ['HOME']
    self._cmd_fn = None
    self._cmd = None
    self._status_fn = None  # location of output of last status
    self._out_fn = None

    # run() asserts on this flag; it is switched on once the dirs exist
    self._can_run = False
    self.tmux_session = tmux_session
    self.tmux_window_id = 0
    self.tmux_available_window_ids = [0]

    self.name = name
    self.install_script = install_script
    self.job = job
    self.kwargs = kwargs

    # local servers sometimes listen only on localhost (TensorBoard), and sometimes only on
    # externally assigned ip address from gethostbyname (Ray); only one can be chosen.
    # The gethostbyname address is used here, the localhost alternative is kept commented out below.
    # https://github.com/ray-project/ray/issues/1677
    self.public_ip = socket.gethostbyname(socket.gethostname())
    #  self.public_ip = '127.0.0.1'
    self.ip = self.public_ip

    self.connect_instructions = 'tmux a -t ' + self.tmux_session

    # task current dir
    print('name is', name)
    # tmpdir = f"{util.reverse_taskname(name)}.{os.getpid()}.{util.now_micros()}"
    # a random launch id is appended to the directory names of this task
    launch_id = util.random_id()
    self.taskdir = f"{TASKDIR_ROOT}/{name}-{launch_id}"
    # local and remote scratch are the same path for the local backend
    self.local_scratch = f"{SCRATCH_ROOT}/{name}-{launch_id}"
    self.remote_scratch = f"{SCRATCH_ROOT}/{name}-{launch_id}"

    self.log(f"Creating taskdir {self.taskdir}")
    self._run_raw('mkdir -p ' + self.taskdir)

    self.log(f"Creating scratch {self.local_scratch}")
    self._run_raw('rm -Rf ' + self.local_scratch)
    self._run_raw('mkdir -p ' + self.local_scratch)
    self._run_raw('mkdir -p ' + self.remote_scratch)
    self.run_counter = 0

    self._cwd = self.taskdir
    self._can_run = True
    self.run('cd ' + self.taskdir)

    print("Running install script " + install_script)
    self.install_script = install_script
    # run() skips empty and commented-out lines
    for line in install_script.split('\n'):
      self.run(line)
 87 | 
 88 |   def run(self, cmd, non_blocking=False, ignore_errors=False, **_kwargs):
 89 | 
 90 |     if util.is_set('NCLUSTER_RUN_WITH_OUTPUT_ON_FAILURE'):
 91 |       # HACK
 92 |       if not util.is_bash_builtin(cmd) or True:
 93 |         return self._run_with_output_on_failure(cmd, non_blocking, ignore_errors, **_kwargs)
 94 |       else:
 95 |         self.log("Found bash built-in, using regular run")
 96 | 
 97 |     if not self._can_run:
 98 |       assert False, "Using .run before initialization finished"
 99 |     if '\n' in cmd:
100 |       cmds = cmd.split('\n')
101 |       self.log(
102 |         f"Running {len(cmds)} commands at once, returning status of last")
103 |       status = -1
104 |       for subcmd in cmds:
105 |         status = self.run(subcmd)
106 |       return status
107 | 
108 |     cmd = cmd.strip()
109 |     if not cmd or cmd.startswith('#'):  # ignore empty/commented out lines
110 |       return -1
111 |     self.run_counter += 1
112 |     self.log("tmux> %s", cmd)
113 | 
114 |     self._cmd = cmd
115 |     self._cmd_fn = f'{self.local_scratch}/{self.run_counter}.cmd'
116 |     self._status_fn = f'{self.remote_scratch}/{self.run_counter}.status'
117 |     assert not os.path.exists(self._status_fn)
118 | 
119 |     cmd = util.shell_strip_comment(cmd)
120 |     # assert '&' not in cmd, f"cmd {cmd} contains &, that breaks things"
121 | 
122 |     self.write(self._cmd_fn, cmd + '\n')
123 |     modified_cmd = f'{cmd} ; echo $? > {self._status_fn}'
124 |     modified_cmd = shlex.quote(modified_cmd)
125 | 
126 |     tmux_window = self.tmux_session+':'+str(self.tmux_window_id)
127 |     tmux_cmd = f'tmux send-keys -t {tmux_window} {modified_cmd} Enter'
128 |     self._run_raw(tmux_cmd, ignore_errors=ignore_errors)
129 |     if non_blocking:
130 |       return 0
131 | 
132 |     if not self.wait_for_file(self._status_fn, max_wait_sec=60):
133 |       self.log(f"Retrying waiting for {self._status_fn}")
134 |     while not self.exists(self._status_fn):
135 |       self.log(f"Still waiting for {cmd}")
136 |       self.wait_for_file(self._status_fn, max_wait_sec=60)
137 |     contents = self.read(self._status_fn)
138 | 
139 |     # if empty wait a bit to allow for race condition
140 |     if len(contents) == 0:
141 |       time.sleep(0.01)
142 |     status = int(open(self._status_fn).read().strip())
143 |     self.last_status = status
144 | 
145 |     if status != 0:
146 |       if not ignore_errors:
147 |         raise RuntimeError(f"Command {cmd} returned status {status}")
148 |       else:
149 |         self.log(f"Warning: command {cmd} returned status {status}")
150 | 
151 |     return status
152 | 
153 |   def join(self, ignore_errors=False):
154 |     """Waits until last executed command completed."""
155 |     assert self._status_fn, "Asked to join a task which hasn't had any commands executed on it"
156 |     check_interval = 0.2
157 |     status_fn = self._status_fn
158 |     if not self.wait_for_file(status_fn, max_wait_sec=30):
159 |       self.log(f"Retrying waiting for {status_fn}")
160 |     while not self.exists(status_fn):
161 |       self.log(f"Still waiting for {self._cmd}")
162 |       self.wait_for_file(status_fn, max_wait_sec=30)
163 |     contents = self.read(status_fn)
164 | 
165 |     # if empty wait a bit to allow for race condition
166 |     if len(contents) == 0:
167 |       time.sleep(check_interval)
168 |       contents = self.read(status_fn)
169 |     status = int(contents.strip())
170 |     self.last_status = status
171 | 
172 |     if status != 0:
173 |       extra_msg = '(ignoring error)' if ignore_errors else '(failing)'
174 |       if util.is_set('NCLUSTER_RUN_WITH_OUTPUT_ON_FAILURE'):
175 |         self.log(
176 |           f"Start failing output {extra_msg}: \n{'*'*80}\n\n '{self.read(self._out_fn)}'")
177 |         self.log(f"\n{'*'*80}\nEnd failing output")
178 |       if not ignore_errors:
179 |         raise RuntimeError(f"Command {self._cmd} returned status {status}")
180 |       else:
181 |         self.log(f"Warning: command {self._cmd} returned status {status}")
182 | 
183 |     return status
184 | 
185 |   def switch_window(self, window_id: int):
186 |     """
187 |     Switches currently active tmux window for given task. 0 is the default window
188 |     Args:
189 |       window_id: integer id of tmux window to use
190 |     """
191 | 
192 |     # windows are numbered sequentially 0, 1, 2, ...
193 |     # create any missing windows and make them point to the same directory
194 |     if window_id not in self.tmux_available_window_ids:
195 |       for i in range(max(self.tmux_available_window_ids)+1, window_id+1):
196 |         self._run_raw(f'tmux new-window -t {self.tmux_session} -d')
197 | 
198 |         tmux_window = self.tmux_session + ':' + str(i)
199 |         cmd = shlex.quote(f'cd {self.taskdir}')
200 |         tmux_cmd = f'tmux send-keys -t {tmux_window} {cmd} Enter'
201 |         self._run_raw(tmux_cmd)
202 |         self.tmux_available_window_ids.append(i)
203 | 
204 |     self.tmux_window_id = window_id
205 | 
206 |   # This is a future "run" command, will become "run" once all cases are checked
  def _run_with_output_on_failure(self, cmd, non_blocking=False, ignore_errors=False, **_kwargs) -> str:
    """Runs cmd in the task's current tmux window, capturing its output to a file.

    The command is typed into tmux with stdout/stderr tee'd into a per-command
    ".out" file and its exit status echoed into a per-command ".status" file;
    completion is detected by waiting for the status file to appear.

    Args:
      cmd: shell command. If it contains newlines, each line is passed to
        self.run separately and the result of the last self.run call is
        returned instead of captured output.
      non_blocking: if True, return '' right after sending the command to tmux
      ignore_errors: if True, a non-zero status is logged instead of raised
      **_kwargs: accepted and ignored

    Returns:
      Contents of the ".out" file; '' for empty/comment-only commands and for
      non_blocking calls.

    Raises:
      RuntimeError: command returned non-zero status and ignore_errors is False
    """
    if not self._can_run:
      assert False, "Using .run before initialization finished"
    if '\n' in cmd:
      cmds = cmd.split('\n')
      self.log(
        f"Running {len(cmds)} commands at once, returning status of last")
      status = -1
      # NOTE(review): this branch delegates to self.run (not this method) and
      # drops non_blocking/ignore_errors — confirm that is intended
      for subcmd in cmds:
        status = self.run(subcmd)
      return status

    cmd = cmd.strip()
    if not cmd or cmd.startswith('#'):  # ignore empty/commented out lines
      return ''
    self.run_counter += 1
    self.log("tmux> %s", cmd)

    # per-command bookkeeping files, numbered by run_counter
    self._cmd = cmd
    self._cmd_fn = f'{self.local_scratch}/{self.run_counter}.cmd'
    self._status_fn = f'{self.remote_scratch}/{self.run_counter}.status'
    self._out_fn = f'{self.remote_scratch}/{self.run_counter}.out'
    # a pre-existing status file would make the wait below return immediately
    assert not os.path.exists(self._status_fn)

    cmd = util.shell_strip_comment(cmd)
    # assert '&' not in cmd, f"cmd {cmd} contains &, that breaks things"

    self.write(self._cmd_fn, cmd + '\n')
    #  modified_cmd = f'{cmd} ; echo $? > {self._status_fn}'
    # process substitution tees both streams into the same .out file while
    # still showing them in the tmux window; $? is written to the status file
    modified_cmd = f'{cmd} > >(tee -a {self._out_fn}) 2> >(tee -a {self._out_fn} >&2); echo $? > {self._status_fn}'
    modified_cmd = shlex.quote(modified_cmd)

    tmux_window = self.tmux_session+':'+str(self.tmux_window_id)
    tmux_cmd = f'tmux send-keys -t {tmux_window} {modified_cmd} Enter'
    self._run_raw(tmux_cmd)
    if non_blocking:
      return ''

    # block until the status file shows up, logging once per 60 second wait
    if not self.wait_for_file(self._status_fn, max_wait_sec=60):
      self.log(f"Retrying waiting for {self._status_fn}")
    while not self.exists(self._status_fn):
      self.log(f"Still waiting for {cmd}")
      self.wait_for_file(self._status_fn, max_wait_sec=60)
    contents = self.read(self._status_fn)

    # if empty wait a bit to allow for race condition
    if len(contents) == 0:
      time.sleep(0.01)
    # status file is re-read directly from disk here (file handle is not
    # explicitly closed)
    status = int(open(self._status_fn).read().strip())
    self.last_status = status

    if status != 0:
      extra_msg = '(ignoring error)' if ignore_errors else '(failing)'
      self.log(
        f"Start failing output {extra_msg}: \n{'*'*80}\n\n '{self.read(self._out_fn)}'")
      self.log(f"\n{'*'*80}\nEnd failing output")
      if not ignore_errors:
        raise RuntimeError(f"Command {cmd} returned status {status}")
      else:
        self.log(f"Warning: command {cmd} returned status {status}")

    return self.read(self._out_fn)
269 | 
270 |   def _run_raw(self, cmd, ignore_errors=False):
271 |     """Runs command directly, skipping tmux interface"""
272 |     # TODO: capture stdout/stderr for feature parity with aws_backend
273 |     result = os.system(cmd)
274 |     if result != 0:
275 |       if ignore_errors:
276 |         self.log(f"command ({cmd}) failed.")
277 |         assert False, "_run_raw failed"
278 | 
279 |   def upload(self, local_fn, remote_fn=None, dont_overwrite=False):
280 |     """Uploads file to remote instance. If location not specified, dumps it
281 |     into default directory. Creates missing directories in path name."""
282 | 
283 |     # support wildcard through glob
284 |     if '*' in local_fn:
285 |       for local_subfn in glob.glob(local_fn):
286 |         self.upload(local_subfn)
287 |       return
288 | 
289 |     if remote_fn is None:
290 |       remote_fn = os.path.basename(local_fn)
291 | 
292 |     if dont_overwrite and self.exists(remote_fn):
293 |       self.log("Remote file %s exists, skipping" % (remote_fn,))
294 |       return
295 | 
296 |     if not remote_fn.startswith('/'):
297 |       remote_fn = self.taskdir + '/' + remote_fn
298 | 
299 |     remote_fn = remote_fn.replace('~', self.homedir)
300 |     self.log('uploading ' + local_fn + ' to ' + remote_fn)
301 | 
302 |     local_fn = os.path.abspath(local_fn)
303 |     self._run_raw("cp -R %s %s" % (local_fn, remote_fn))
304 | 
305 |   def download(self, remote_fn, local_fn='.'):
306 |     if local_fn == '.':
307 |       local_fn = self._cwd
308 |     #    self.log("downloading %s to %s" % (remote_fn, local_fn))
309 |     if not remote_fn.startswith('/'):
310 |       remote_fn = self._cwd + '/' + remote_fn
311 |     if self.exists(remote_fn):
312 |       os.system(f'cp {remote_fn} {local_fn}')
313 |     else:
314 |       raise RuntimeError(f"No such file {remote_fn}")
315 | 
316 |   def exists(self, remote_fn):
317 |     return os.path.exists(remote_fn)
318 | 
319 |   def read(self, remote_fn):
320 |     tmp_fn = self.local_scratch + '/' + str(util.now_micros())
321 |     self.download(remote_fn, tmp_fn)
322 |     return open(tmp_fn).read()
323 | 
324 |   def write(self, remote_fn, contents):
325 |     def make_temp_fn():
326 |       """Returns temporary filename for this task."""
327 |       return self.local_scratch + '/write.' + str(util.now_micros())
328 | 
329 |     tmp_fn = make_temp_fn()
330 |     open(tmp_fn, 'w').write(contents)
331 |     self.upload(tmp_fn, remote_fn)
332 | 
333 |   # don't include file streaming for now
334 |   # the issue is that file streaming by default turns on 4K buffering, which makes
335 |   # streaming a lot less useful. Similar buffering is turned on for piping commands
336 |   # https://unix.stackexchange.com/questions/25372/turn-off-buffering-in-pipe
337 |   # def file_stream(self, fn: str) -> None:
338 |   #   #    if not fn.startswith('/'):
339 |   #   #      fn = self.taskdir + '/' + fn
340 |   #
341 |   #   if not os.path.exists(fn):
342 |   #     os.system('mkdir -p ' + os.path.dirname(os.path.abspath(fn)))
343 |   #     os.system('touch ' + fn)
344 |   #
345 |   #   p = subprocess.Popen(['tail', '-f', fn], stdout=subprocess.PIPE)
346 |   #
347 |   #   for line in iter(p.stdout.readline, ''):
348 |   #     sys.stdout.write(line.decode('ascii', errors='ignore'))
349 | 
350 |   @property
351 |   def logdir(self):
352 |     """Returns logging directory, creating one if necessary. See "Logdir" section  of design doc on naming convention."""
353 | 
354 |     run_name = ncluster_globals.get_run_for_task(self)
355 |     logdir = ncluster_globals.get_logdir(run_name)
356 |     if logdir:
357 |       return logdir
358 | 
359 |     # create logdir. Only single task in a group creates the logdir
360 |     if ncluster_globals.is_chief(self, run_name):
361 |       chief = self
362 |     else:
363 |       chief = ncluster_globals.get_chief(run_name)
364 | 
365 |     chief.setup_logdir()
366 |     return ncluster_globals.get_logdir(run_name)
367 |    # release lock
368 | 
369 |   def setup_logdir(self):
370 |     # todo: locking on logdir creation
371 | 
372 |     """Create logdir for task/job/run. No-op if the task is not chief (0'th task of 0'th job of run)
373 |     """
374 |     run_name = ncluster_globals.get_run_for_task(self)
375 |     self.log("Creating logdir for run "+run_name)
376 |     logdir_root = ncluster_globals.LOGDIR_ROOT
377 |     assert logdir_root
378 | 
379 |     self.run(f'mkdir -p {logdir_root}')
380 |     find_command = f'find {logdir_root} -maxdepth 1 -type d'
381 | 
382 |     stdout, stderr = self.run_with_output(find_command)
383 |     logdir = f"{logdir_root}/{run_name}"
384 | 
385 |     counter = 0
386 |     while logdir in stdout:
387 |       counter += 1
388 |       new_logdir = f'{logdir_root}/{run_name}.{counter:02d}'
389 |       self.log(f'Warning, logdir {logdir} exists, deduping to {new_logdir}')
390 |       logdir = new_logdir
391 |     self.run(f'mkdir -p {logdir}')
392 | 
393 |     ncluster_globals.set_logdir(run_name, logdir)
394 |     return logdir
395 | 
396 | 
class Job(backend.Job):
  """Job of this backend; adds nothing on top of backend.Job."""
399 | 
400 | 
class Run:
  """Run is a collection of jobs that share state. IE, training run will contain gradient worker job, parameter
  server job, and TensorBoard visualizer job. These jobs will use the same shared directory to store checkpoints and
  event files.

  Attributes:
    name: name of the run
    kwargs: extra keyword arguments given to the constructor, stored as-is
    jobs: jobs created through make_job, in creation order
  """
  jobs: List[Job]

  def __init__(self, name='', **kwargs):
    """Creates a run; extra keyword arguments are kept in self.kwargs."""

    self.name = name
    self.kwargs = kwargs
    # must exist before run()/upload()/... iterate over it
    self.jobs = []

  @property
  def logdir(self):
    """Logdir of the run, taken from the run's chief task."""
    chief_task = ncluster_globals.get_chief(self.name)
    return chief_task.logdir

  # TODO: currently this is synchronous, use non_blocking wrapper like in Job to parallelize methods
  def run(self, *args, **kwargs):
    """Runs command on every job in the run."""

    for job in self.jobs:
      job.run(*args, **kwargs)

  def run_with_output(self, *args, **kwargs):
    """Runs command on every job in the run, returns list of per-job results."""
    return [job.run_with_output(*args, **kwargs) for job in self.jobs]

  def _run_raw(self, *args, **kwargs):
    """_run_raw on every job in the run."""
    for job in self.jobs:
      job._run_raw(*args, **kwargs)

  def upload(self, *args, **kwargs):
    """Uploads to every job in the run."""
    for job in self.jobs:
      job.upload(*args, **kwargs)

  def make_job(self, name='', **kwargs):
    """Creates job named "<name>.<run name>" and adds it to this run."""
    job = make_job(name+'.'+self.name, run_name=self.name, **kwargs)
    # remember the job, otherwise run()/upload() have nothing to act on
    self.jobs.append(job)
    return job
445 | 
446 | 
def make_task(name='',
              run_name='',
              **kwargs) -> Task:
  """Creates a Task backed by a fresh detached tmux session.

  Args:
    name: task name, auto-assigned if empty. Dots are replaced with '=' to
      form the tmux session name.
    run_name: name of the run the task is registered under
    **kwargs: passed through to the Task constructor

  An existing tmux session with the same name is killed first unless the
  NCLUSTER_NOKILL_TMUX environment variable is set to 1.
  """
  ncluster_globals.task_launched = True

  name = ncluster_globals.auto_assign_task_name_if_needed(name)

  # tmux can't use . for session names
  tmux_session = name.replace('.', '=')
  tmux_window_id = 0

  if not util.is_set("NCLUSTER_NOKILL_TMUX"):
    # only announce the kill when it actually happens
    util.log(f'killing session {tmux_session}')
    os.system(f'tmux kill-session -t {tmux_session}')
  os.system(f'tmux new-session -s {tmux_session} -n {tmux_window_id} -d')

  task = Task(name,
              tmux_session=tmux_session,  # propagate optional args
              run_name=run_name,
              **kwargs)
  ncluster_globals.register_task(task, run_name)
  return task
470 | 
471 | 
def make_job(name="",
             num_tasks=1,
             run_name="",
             install_script='',
             **kwargs
             ) -> backend.Job:
  """Creates a job consisting of num_tasks tasks named "<index>.<job name>".

  Args:
    name: job name, auto-assigned if empty
    num_tasks: number of tasks to create, must be positive
    run_name: run that the tasks are registered under
    install_script: forwarded to every make_task call
    **kwargs: forwarded both to make_task and to backend.Job
  """
  assert num_tasks > 0, f"Can't create job with {num_tasks} tasks"

  name = ncluster_globals.auto_assign_job_name_if_needed(name)
  util.validate_ncluster_job_name(name)

  tasks = []
  for task_index in range(num_tasks):
    tasks.append(make_task(f"{task_index}.{name}",
                           run_name=run_name,
                           install_script=install_script,
                           **kwargs))

  return backend.Job(name=name, tasks=tasks, **kwargs)
490 | 
491 | 
def make_run(name='', **kwargs) -> Run:
  """Creates a Run; keyword arguments are passed on to the Run constructor.

  Accepting **kwargs keeps this in line with Run.__init__ and with callers
  that invoke make_run(name=..., **kwargs).
  """
  return Run(name, **kwargs)
495 | 


--------------------------------------------------------------------------------
/ncluster/ncluster.py:
--------------------------------------------------------------------------------
  1 | from . import aws_backend
  2 | from . import local_backend
  3 | from . import backend
  4 | from . import aws_util as u
  5 | import collections
  6 | 
  7 | from . import ncluster_globals
  8 | 
  9 | _backend: type(backend) = backend
 10 | 
 11 | 
 12 | def get_logdir_root() -> str:
 13 |   return _backend.LOGDIR_ROOT
 14 | 
 15 | 
 16 | def set_logdir_root(logdir_root):
 17 |   """Globally changes logdir root for all runs."""
 18 |   _backend.LOGDIR_ROOT = logdir_root
 19 | 
 20 | 
 21 | def set_backend(backend_name: str):
 22 |   """Sets backend (local or aws)"""
 23 |   global _backend, _backend_name
 24 |   _backend_name = backend_name
 25 | 
 26 |   assert not ncluster_globals.task_launched, "Not allowed to change backend after launching a task (this pattern is error-prone)"
 27 |   if backend_name == 'aws':
 28 |     _backend = aws_backend
 29 |   elif backend_name == 'local':
 30 |     _backend = local_backend
 31 |   else:
 32 |     assert False, f"Unknown backend {backend_name}"
 33 |   ncluster_globals.LOGDIR_ROOT = _backend.LOGDIR_ROOT
 34 | 
 35 | 
 36 | def use_aws():
 37 |   set_backend('aws')
 38 | 
 39 | 
 40 | def use_local():
 41 |   set_backend('local')
 42 | 
 43 | 
 44 | def get_backend() -> str:
 45 |   """Returns backend name, ie "local" or "aws" """
 46 |   return _backend_name
 47 | 
 48 | 
 49 | def get_backend_module() -> backend:
 50 |   return _backend
 51 | 
 52 | 
 53 | def running_locally():
 54 |   return get_backend() == 'local'
 55 | 
 56 | 
 57 | def get_region() -> str:
 58 |   if _backend != local_backend:
 59 |     return u.get_region()
 60 |   else:
 61 |     return 'local'
 62 | 
 63 | 
 64 | def get_zone() -> str:
 65 |   if _backend != local_backend:
 66 |     return u.get_zone()
 67 |   else:
 68 |     return 'local'
 69 | 
 70 | 
 71 | #  def make_run(name='', **kwargs):
 72 | #  return _backend.Run(name, **kwargs)
 73 | 
 74 | 
 75 | # Use factory methods task=create_task instead of relying solely on constructors task=Task() because underlying hardware resources may be reused between instantiations
 76 | # For instance, one may create a Task initialized with an instance that was previous created for this kind of task
 77 | # Factory method will make the decision to recreate or reuse such resource, and wrap this resource with a Task object.
 78 | def make_task(name: str = '',
 79 |               run_name: str = '',
 80 |               install_script: str = '',
 81 |               **kwargs) -> backend.Task:
 82 |   return _backend.make_task(name=name, run_name=run_name,
 83 |                             install_script=install_script, **kwargs)
 84 | 
 85 | 
 86 | def make_job(name: str = '',
 87 |              run_name: str = '',
 88 |              num_tasks: int = 0,
 89 |              install_script: str = '',
 90 |              **kwargs
 91 |              ) -> backend.Job:
 92 |   """
 93 |   Create a job using current backend. Blocks until all tasks are up and initialized.
 94 | 
 95 |   Args:
 96 |     name: name of the job
 97 |     run_name: name of the run (auto-assigned if empty)
 98 |     num_tasks: number of tasks
 99 |     install_script: bash-runnable script
100 |     **kwargs:
101 | 
102 |   Returns:
103 |     backend.Job
104 |   """
105 |   return _backend.make_job(name=name, run_name=run_name, num_tasks=num_tasks,
106 |                            install_script=install_script, **kwargs)
107 | 
108 | 
def make_run(name: str = '', **kwargs) -> backend.Run:
  """Creates a run by delegating to make_run of the current backend."""
  forwarded = dict(name=name, **kwargs)
  return _backend.make_run(**forwarded)
111 | 
112 | 
113 | # TODO: remove?
def join(things_to_join):
  """Calls .join() on the argument, or on each element if it is iterable."""
  # collections.Iterable was removed in Python 3.10; the ABC lives in
  # collections.abc
  from collections.abc import Iterable

  if isinstance(things_to_join, Iterable):
    for thing in things_to_join:
      thing.join()
  else:
    things_to_join.join()
120 | 


--------------------------------------------------------------------------------
/ncluster/ncluster_globals.py:
--------------------------------------------------------------------------------
  1 | """Module that keeps global state of ncluster tasks, such as naming,
  2 | connection of tasks to runs
  3 | 
  4 | run refers to string name
  5 | run_object refers to Run object corresponding to that name
  6 | 
  7 | """
  8 | import os
  9 | import sys
 10 | from typing import Dict, Any, List
 11 | from . import util
 12 | from . import backend
 13 | 
# root directory for logdirs; assigned from outside this module
LOGDIR_ROOT = None
task_launched = False  # keep track whether anything has been launched

# counters used to number auto-generated task/job/run names
task_counter = 0
job_counter = 0
run_counter = 0

# run name -> run object (see register_run)
run_dict: Dict[str, Any] = {}
# task -> name of the run it was registered under (see register_task)
task_run_dict: Dict[backend.Task, str] = {}
# run name -> tasks registered under it, in registration order
run_task_dict: Dict[str, List[backend.Task]] = {}
# run name -> logdir (see set_logdir)
run_logdir_dict: Dict[str, str] = {}

tasks_seen: List[str] = []  # names of all tasks registered so far
 27 | 
 28 | 
 29 | def auto_assign_task_name_if_needed(name, instance_type='', image_name='',
 30 |                                     tasks=1):
 31 |   global task_counter
 32 |   if name:
 33 |     return name
 34 | 
 35 |   main_script = os.path.abspath(sys.argv[0])
 36 |   script_id = util.alphanumeric_hash(
 37 |     f"{main_script}-{instance_type}-{image_name}-{tasks}")
 38 |   name = f"unnamedtask-{task_counter}-{script_id}"
 39 |   task_counter += 1
 40 |   return name
 41 | 
 42 | 
 43 | def auto_assign_job_name_if_needed(name):
 44 |   global job_counter
 45 |   if name:
 46 |     return name
 47 |   script_id = util.alphanumeric_hash(sys.argv[0])
 48 |   name = f"unnamedjob-{job_counter}-{script_id}"
 49 |   job_counter += 1
 50 |   return name
 51 | 
 52 | 
 53 | def auto_assign_run_name_if_needed(name):
 54 |   global run_counter
 55 |   if name:
 56 |     return name
 57 |   script_id = util.alphanumeric_hash(sys.argv[0])
 58 |   name = f"unnamedrun-{run_counter}-{script_id}"
 59 |   run_counter += 1
 60 |   return name
 61 | 
 62 | 
 63 | # def add_job_to_run(job, run_name):
 64 | #   global run_dict, job_run_dict
 65 | #   return job_run_dict.get(job, '')
 66 | #
 67 | 
 68 | # def register_run(name: str, run):
 69 | #   global run_dict, placement_dict
 70 | #   run_dict[name] = run
 71 | #
 72 | 
 73 | 
 74 | def register_task(task: Any, run_name: str):
 75 |   global task_run_dict, run_task_dict, tasks_seen
 76 |   assert task.name not in tasks_seen
 77 |   tasks_seen.append(task.name)
 78 |   task_run_dict[task] = run_name
 79 |   run_task_dict.setdefault(run_name, []).append(task)
 80 | 
 81 | 
 82 | def register_run(run: backend.Run, run_name) -> None:
 83 |   assert run_name not in run_dict
 84 |   assert run_name  # empty name reserved to mean no run
 85 |   run_dict[run_name] = run
 86 | 
 87 | 
 88 | def is_chief(task: backend.Task, run_name: str):
 89 |   """Returns True if task is chief task in the corresponding run"""
 90 |   global run_task_dict
 91 |   if run_name not in run_task_dict:
 92 |     return True
 93 |   task_list = run_task_dict[run_name]
 94 |   assert task in task_list, f"Task {task.name} doesn't belong to run {run_name}"
 95 |   return task_list[0] == task
 96 | 
 97 | 
 98 | def get_chief(run_name: str):
 99 |   assert run_name in run_task_dict, f"Run {run_name} doesn't exist"
100 |   tasks = run_task_dict[run_name]
101 |   assert tasks, f"Run {run_name} had tasks {tasks}, expected non-empty list"
102 |   return tasks[0]
103 | 
104 | 
def get_logdir(run_name: str):
  """Returns logdir recorded for run_name, or '' if none has been set.

  An empty run_name maps to '/tmp'. Recording the logdir is the job of the
  logdir creator (see set_logdir).
  """
  if run_name:
    return run_logdir_dict.get(run_name, '')
  return '/tmp'
111 | 
112 | 
def get_run_object(run_name: str) -> backend.Run:
  """Returns run object registered under run_name, or None if there is none."""
  return run_dict.get(run_name)
115 | 
116 | 
def set_logdir(run_name, logdir):
  """Records logdir for run_name; a run's logdir can only be set once."""
  already_set = run_name in run_logdir_dict
  assert not already_set, f"logdir for run {run_name} has already been set to {run_logdir_dict[run_name]}, trying to change it to {logdir} is illegal"
  run_logdir_dict[run_name] = logdir
120 | 
121 | 
def get_run_for_task(task: backend.Task) -> str:
  """Returns name of the run task was registered under, '' if it wasn't."""
  run_name = task_run_dict.get(task, '')
  return run_name
125 | 
126 | 
# NOTE(review): same name and body as a definition earlier in this module;
# this later one is what the module ends up exposing.
def get_run_object(run_name: str) -> backend.Run:
  """Returns run object registered under run_name, or None if there is none."""
  return run_dict.get(run_name)
129 | 
130 | 
def create_run_if_needed(run_name, run_creation_callback) -> backend.Run:
  """Returns registered run for run_name, else the result of run_creation_callback(run_name).

  The callback result is returned as-is; this function does not add it to
  run_dict itself.
  """
  if run_name not in run_dict:
    return run_creation_callback(run_name)
  return run_dict[run_name]
136 | 


--------------------------------------------------------------------------------
/ncluster/summary.txt:
--------------------------------------------------------------------------------
1 | tf_two_machines -- 500 on t3, 910 on c3
2 | 
3 | 


--------------------------------------------------------------------------------
/ncluster/test.py:
--------------------------------------------------------------------------------
1 | 
# Scratch comparison of %-formatting with the equivalent f-string format spec.

# "%20s" right-aligns in a 20 character field; ">20" is the f-string equivalent
print("%20s" % ('asdfasdf',))
print(f"{'asdfasdf':>20}")

# "%5.2f" and "5.2f" both mean minimum width 5 with 2 digits after the point
print("%5.2f" % (5.5,))
print(f"{5.5:5.2f}")
7 | 


--------------------------------------------------------------------------------
/ncluster/util.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Various helper utilities used internally by ncluster project, but may be potentially
  3 | useful outside of the cluster project.
  4 | """
  5 | 
import os
import random
import shlex
import string
import sys
import time

try:
  from collections.abc import Iterable
except ImportError:  # very old interpreters only
  from collections import Iterable
 13 | 
 14 | # starting value for now_micros (Aug 31, 2018)
 15 | # using this to make various timestamped names shorter
 16 | EPOCH_MICROS = 1535753974788163
 17 | 
 18 | 
 19 | def is_iterable(k):
 20 |   return isinstance(k, Iterable)
 21 | 
 22 | 
 23 | def now_micros(absolute=False) -> int:
 24 |   """Return current micros since epoch as integer."""
 25 |   micros = int(time.time() * 1e6)
 26 |   if absolute:
 27 |     return micros
 28 |   return micros - EPOCH_MICROS
 29 | 
 30 | 
 31 | def now_millis(absolute=False) -> int:
 32 |   """Return current millis since epoch as integer."""
 33 |   millis = int(time.time() * 1e3)
 34 |   if absolute:
 35 |     return millis
 36 |   return millis - EPOCH_MICROS // 1000
 37 | 
 38 | 
 39 | def current_timestamp() -> str:
 40 |   # timestamp format from https://github.com/tensorflow/tensorflow/blob/155b45698a40a12d4fef4701275ecce07c3bb01a/tensorflow/core/platform/default/logging.cc#L80
 41 |   current_seconds = time.time()
 42 |   remainder_micros = int(1e6 * (current_seconds - int(current_seconds)))
 43 |   time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(current_seconds))
 44 |   full_time_str = "%s.%06d" % (time_str, remainder_micros)
 45 |   return full_time_str
 46 | 
 47 | 
 48 | def log_error(*args, **kwargs):
 49 |   print(f"Error encountered {args} {kwargs}")
 50 | 
 51 | 
 52 | def log(*args, **kwargs):
 53 |   print(f"{args} {kwargs}")
 54 | 
 55 | 
 56 | def install_pdb_handler():
 57 |   """Make CTRL+\ break into gdb."""
 58 | 
 59 |   import signal
 60 |   import pdb
 61 | 
 62 |   def handler(_signum, _frame):
 63 |     pdb.set_trace()
 64 | 
 65 |   signal.signal(signal.SIGQUIT, handler)
 66 | 
 67 | 
 68 | def shell_add_echo(script):
 69 |   """Goes over each line script, adds "echo cmd" in front of each cmd.
 70 | 
 71 |   ls a
 72 | 
 73 |   becomes
 74 | 
 75 |   echo * ls a
 76 |   ls a
 77 |   """
 78 |   new_script = ""
 79 |   for cmd in script.split('\n'):
 80 |     cmd = cmd.strip()
 81 |     if not cmd:
 82 |       continue
 83 |     new_script += "echo \\* " + shlex.quote(cmd) + "\n"
 84 |     new_script += cmd + "\n"
 85 |   return new_script
 86 | 
 87 | 
 88 | def shell_strip_comment(cmd):
 89 |   """ hi # testing => hi"""
 90 |   if '#' in cmd:
 91 |     return cmd.split('#', 1)[0]
 92 |   else:
 93 |     return cmd
 94 | 
 95 | 
 96 | def random_id(k=5):
 97 |   """Random id to use for AWS identifiers."""
 98 |   #  https://stackoverflow.com/questions/2257441/random-string-generation-with-upper-case-letters-and-digits-in-python
 99 |   return ''.join(random.choices(string.ascii_lowercase + string.digits, k=k))
100 | 
101 | 
def alphanumeric_hash(s: str, size=5):
  """Short alphanumeric string derived from hash of given string.

  Args:
    s: string to hash; encoded as UTF-8, so non-ASCII input (for instance a
      script path) is accepted. Output for pure ASCII input is unchanged.
    size: number of characters to keep from the base32-encoded MD5 digest

  Returns:
    lowercase string of at most size characters
  """
  import hashlib
  import base64
  digest = hashlib.md5(s.encode('utf-8')).digest()
  encoded = base64.b32encode(digest)
  return encoded[:size].decode('ascii').lower()
110 | 
111 | 
def reverse_taskname(name: str) -> str:
  """
  Returns name with its dot-separated components in reverse order. The
  reversed convention is used for filenames since it groups log/scratch files
  of related tasks together.

  0.somejob.somerun -> somerun.somejob.0
  0.somejob -> somejob.0
  somename -> somename

  Args:
    name: name of task, at most three dot-separated components

  """
  parts = name.split('.')
  assert len(parts) <= 3
  return '.'.join(reversed(parts))
128 | 
129 | 
def is_bash_builtin(cmd):
  """Return true if the first whitespace-separated word of cmd is a bash built-in
  """
  # from compgen -b
  builtin_names = (
    'alias', 'bg', 'bind', 'break', 'builtin', 'caller', 'cd', 'command',
    'compgen', 'complete', 'compopt', 'continue', 'declare', 'dirs',
    'disown', 'echo', 'enable', 'eval', 'exec', 'exit', 'export', 'false',
    'fc', 'fg', 'getopts', 'hash', 'help', 'history', 'jobs', 'kill', 'let',
    'local', 'logout', 'mapfile', 'popd', 'printf', 'pushd', 'pwd', 'read',
    'readarray', 'readonly', 'return', 'set', 'shift', 'shopt', 'source',
    'suspend', 'test', 'times', 'trap', 'true', 'type', 'typeset', 'ulimit',
    'umask', 'unalias', 'unset', 'wait',
  )
  first_word = cmd.split()[:1]
  return bool(first_word) and first_word[0] in builtin_names
148 | 
149 | 
def is_set(name):
  """Returns True if environment variable name is '1', False if it is '0' or unset.

  Any other value fails an assertion.
  """
  raw = os.environ.get(name, '0')
  assert raw in ('0', '1'), f"env var {name} has value {raw}, expected 0 or 1"
  return raw == '1'
155 | 
156 | 
def assert_script_in_current_directory():
  """Assert fail if current directory is different from the directory of sys.argv[0]"""

  script = sys.argv[0]
  script_dir = os.path.abspath(os.path.dirname(script))
  current_dir = os.path.abspath('.')
  assert script_dir == current_dir, f"Change into directory of script {script} and run again."
163 | 
164 | 
def validate_ncluster_job_name(name):
  """Asserts that job name contains at most one '.'."""
  dot_count = name.count('.')
  assert dot_count <= 1, "Job name has too many .'s (see ncluster design: Run/Job/Task hierarchy for  convention)"
168 | 
169 | 
def toseconds(dt):
  """Converts datetime object to seconds via time.mktime of its UTC time tuple."""
  # NOTE(review): time.mktime interprets the tuple as local time, so for
  # timezone-aware datetimes the result may be shifted by the local UTC
  # offset — confirm what callers expect
  utc_tuple = dt.utctimetuple()
  return time.mktime(utc_tuple)
173 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | wrapt
 2 | ray
 3 | numpy
 4 | torch
 5 | tensorflow
 6 | boto3
 7 | paramiko
 8 | portpicker
 9 | tzlocal
10 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [metadata]
 2 | name = ncluster
 3 | version = 0.1.20
 4 | author = Yaroslav Bulatov, Andrew Shaw
 5 | author_email = yaroslavvb@gmail.com
 6 | description= Lightweight interface to launching jobs in the cloud
 7 | long_description = file: README.md
 8 | long_description_content_type = text/markdown
 9 | license_file = LICENSE
10 | url = https://github.com/diux-dev/ncluster
11 | classifiers =
12 |   Programming Language :: Python :: 3
13 |   License :: OSI Approved :: MIT License
14 |   Operating System :: OS Independent
15 | 
16 | [options]
17 | python_requires = >= 3.6
18 | setup_requires =
19 |   setuptools >= 38.6
20 |   pip >= 10
21 |   twine >= 1.11
22 | packages = find:
23 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
# Resource helper scripts shipped with the package.
setup(
  scripts=[
    'ncluster/aws_create_resources.py',
    'ncluster/aws_delete_resources.py',
  ],
)
4 | 
5 | 


--------------------------------------------------------------------------------
/tests/join_test.py:
--------------------------------------------------------------------------------
 1 | import ncluster
 2 | import pytest
 3 | 
 4 | def test():
 5 |   task = ncluster.make_task(image_name=ncluster.aws_backend.GENERIC_SMALL_IMAGE)
 6 |   task.run("mkdir /illegal", non_blocking=True)
 7 |   task.join(ignore_errors=True)  # this succeed/print error message
 8 | 
 9 |   task.run("mkdir /illegal", non_blocking=True)
10 |   with pytest.raises(RuntimeError):
11 |     task.join()  # this should fail
12 | 
13 | if __name__ == '__main__':
14 |   test()
15 | 


--------------------------------------------------------------------------------
/tests/logdir_test.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | # tests to make sure that logdir logic works
 3 | import inspect
 4 | import random
 5 | import sys
 6 | import threading
 7 | 
 8 | import ncluster
 9 | 
10 | 
def test_two_jobs():
  """A file written to logdir by one job's task is readable via another job's task."""
  run = ncluster.make_run('logdir_test')

  writer = run.make_job('job1').tasks[0]
  writer.run(f'echo hello > {writer.logdir}/message')

  reader = run.make_job('job2').tasks[0]
  message = reader.read(f'{reader.logdir}/message')
  assert message.strip() == 'hello'
19 | 
20 | 
def test_multiple_logdirs():
  """Logdir is named after the run; an already existing directory gets a ".01" suffix."""
  first_logdir = ncluster.get_logdir_root() + '/test1'
  helper = ncluster.make_task()
  helper.run(f'rm -Rf {first_logdir}')
  first_task = ncluster.make_task(run_name='test1')
  assert first_task.logdir == first_logdir

  # pre-create the second run's directory so that the logdir has to be deduped
  second_logdir = ncluster.get_logdir_root() + '/test2'
  second_task = ncluster.make_task(run_name='test2')
  helper.run(f'rm -Rf {second_logdir}*')
  helper.run(f'mkdir {second_logdir}')
  assert second_task.logdir == second_logdir + '.01'
33 | 
34 | 
def test_multiple_logdir_tasks():
  """All tasks of a job report the same logdir when queried from separate threads."""
  num_tasks = 10
  helper = ncluster.make_task()
  expected_logdir = ncluster.get_logdir_root() + '/test1'
  helper.run(f'rm -Rf {expected_logdir}')
  job = ncluster.make_job(run_name='test1', num_tasks=num_tasks)

  seen_logdirs = []

  import wrapt

  @wrapt.synchronized
  def query(task_index):
    seen_logdirs.append(job.tasks[task_index].logdir)

  workers = []
  for task_index in range(num_tasks):
    workers.append(threading.Thread(target=query, args=(task_index,)))

  # start in reverse order and join in random order
  for worker in reversed(workers):
    worker.start()

  random.shuffle(workers)
  for worker in workers:
    worker.join()

  assert len(set(seen_logdirs)) == 1
  assert seen_logdirs[0] == expected_logdir
60 | 
61 | 
def run_all_tests(module):
  """Calls every function of module whose name starts with "test"."""
  for name, func in inspect.getmembers(module, inspect.isfunction):
    if not name.startswith('test'):
      continue
    print("Testing " + name)
    func()
  print(module.__name__ + " tests passed.")
69 | 
70 | 
def manual():
  """Runs all test functions defined in this module."""
  this_module = sys.modules[__name__]
  run_all_tests(this_module)


if __name__ == '__main__':
  manual()
77 | 


--------------------------------------------------------------------------------
/tests/run_test.py:
--------------------------------------------------------------------------------
 1 | import ncluster
 2 | 
 3 | def test():
 4 |   run = ncluster.make_run('run_test')
 5 |   job1 = run.make_job('job1')
 6 |   task1 = job1.tasks[0]
 7 |   assert task1.name == '0.job1.run_test'
 8 |   task1.run(f'echo task1sayshello > {task1.logdir}/message')
 9 |   job2 = run.make_job('job2')
10 |   task2 = job2.tasks[0]
11 |   assert task2.name == '0.job2.run_test'
12 |   assert task2.read(f'{task2.logdir}/message').strip() == 'task1sayshello'
13 |   
14 | 
15 | if __name__ == '__main__':
16 |   test()
17 | 


--------------------------------------------------------------------------------