├── .circleci └── config.yml ├── .deepsource.toml ├── .doctrees ├── .eggs │ ├── requests-2.19.1-py3.6.egg │ │ └── EGG-INFO │ │ │ └── DESCRIPTION.doctree │ └── urllib3-1.23-py3.6.egg │ │ └── EGG-INFO │ │ └── DESCRIPTION.doctree ├── environment.pickle └── index.doctree ├── .gitignore ├── LICENSE ├── README.md ├── benchmarks ├── README.md ├── mpi_two_machines.py ├── pytorch_two_machines.py ├── ray_ps.py ├── ray_two_machines.py ├── ray_two_machines_local.py ├── requirements.txt ├── summary.txt ├── tf_two_machines.py ├── tf_two_machines_local.py └── util.py ├── examples ├── deleteme.py ├── gpubox.py ├── gpubox_jupyter_notebook_config.py ├── gpubox_sample.ipynb ├── launch_16_instances.py ├── ray_example.py ├── requirements.txt ├── simple_job.py ├── simple_task.py ├── simple_tf.py ├── tf_adder.py └── tf_adder_tb.py ├── ncluster ├── __init__.py ├── _version.py ├── aws_backend.py ├── aws_util.py ├── local_backend.py ├── ncluster_cloud_setup.py ├── ncluster_cloud_wipe.py ├── ncluster_globals.py ├── old_backend.py ├── summary.txt ├── test.py └── util.py ├── requirements.txt ├── requirements_benchmarks.txt ├── requirements_test.txt ├── setup.cfg ├── setup.py ├── tests ├── integration_test.py ├── join_test.py ├── logdir_test.py ├── many_commands_test.py └── run_test.py └── tools ├── ncluster └── nsync /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | build: 8 | docker: 9 | # specify the version you desire here 10 | # use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers` 11 | - image: circleci/python:3.6.1 12 | 13 | # Specify service dependencies here if necessary 14 | # CircleCI maintains a library of pre-built images 15 | # documented at https://circleci.com/docs/2.0/circleci-images/ 16 | # - image: circleci/postgres:9.4 17 | 18 | working_directory: ~/repo 19 | 20 | steps: 21 | - checkout 22 | 23 | # Download and cache dependencies 24 | - restore_cache: 25 | keys: 26 | - v1-dependencies-{{ checksum "requirements.txt" }} 27 | # fallback to using the latest cache if no exact match is found 28 | - v1-dependencies- 29 | 30 | - run: 31 | name: install dependencies 32 | command: | 33 | python3 -m venv venv 34 | . venv/bin/activate 35 | pip install -r requirements.txt 36 | pip install -U ncluster 37 | 38 | - save_cache: 39 | paths: 40 | - ./venv 41 | key: v1-dependencies-{{ checksum "requirements.txt" }} 42 | 43 | # run tests! 44 | # this example uses Django's built-in test-runner 45 | # other common Python testing frameworks include pytest and nose 46 | # https://pytest.org 47 | # https://nose.readthedocs.io 48 | - run: 49 | name: run tests 50 | command: | 51 | . 
venv/bin/activate 52 | echo 'hello' 53 | python tests/integration_test.py 54 | 55 | - store_artifacts: 56 | path: test-reports 57 | destination: test-reports 58 | -------------------------------------------------------------------------------- /.deepsource.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | test_patterns = [ 4 | 5 | ] 6 | 7 | exclude_patterns = [ 8 | 9 | ] 10 | 11 | [[analyzers]] 12 | name = 'python' 13 | enabled = true 14 | runtime_version = '3.x.x' 15 | -------------------------------------------------------------------------------- /.doctrees/.eggs/requests-2.19.1-py3.6.egg/EGG-INFO/DESCRIPTION.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cybertronai/ncluster/9c2a7fb9677dba9afe48c94f35bde7c41e4cc75f/.doctrees/.eggs/requests-2.19.1-py3.6.egg/EGG-INFO/DESCRIPTION.doctree -------------------------------------------------------------------------------- /.doctrees/.eggs/urllib3-1.23-py3.6.egg/EGG-INFO/DESCRIPTION.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cybertronai/ncluster/9c2a7fb9677dba9afe48c94f35bde7c41e4cc75f/.doctrees/.eggs/urllib3-1.23-py3.6.egg/EGG-INFO/DESCRIPTION.doctree -------------------------------------------------------------------------------- /.doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cybertronai/ncluster/9c2a7fb9677dba9afe48c94f35bde7c41e4cc75f/.doctrees/environment.pickle -------------------------------------------------------------------------------- /.doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cybertronai/ncluster/9c2a7fb9677dba9afe48c94f35bde7c41e4cc75f/.doctrees/index.doctree -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /dist 2 | /build 3 | /.DS_Store 4 | /ncluster.egg-info 5 | /ncluster/__pycache__ 6 | /.eggs 7 | /ncluster/.idea 8 | /.idea 9 | __pycache__ 10 | /.pytest_cache 11 | *.py# 12 | /ncluster/.DS_Store 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2018] [Yaroslav Bulatov] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ncluster
2 | By Yaroslav Bulatov, Andrew Shaw, Ben Mann
3 | https://github.com/cybertronai/ncluster
4 | 
5 | Ncluster provides a Python API to do the following:
6 | - Allocate an AWS machine
7 | - Upload a file to the machine
8 | - Run a command on the machine
9 | - Download a file from the machine
10 | 
11 | For example:
12 | 
13 | ```
14 | import ncluster
15 | task = ncluster.make_task(instance_type='p2.xlarge')
16 | task.upload('myscript.py')
17 | task.run('python myscript.py > out')
18 | task.download('out')
19 | ```
20 | 
21 | Necessary AWS infrastructure is created on demand, using defaults optimized for fast prototyping. For example, your machines are preconfigured for passwordless SSH, can access each other over all interfaces, and have a persistent file system mounted under /ncluster. Commands are executed in a remote tmux session, so you can take over the environment at any time and continue from your terminal.
22 | 
23 | 
24 | ## Installation
25 | Install pip, tmux, Python 3.6 (see below), and [write down](https://docs.google.com/document/d/1Z8lCZVWXs7XORbiNmBAsBDtouV3KwrtH8-UL5M-zHus/edit) your AWS security keys, then
26 | 
27 | ```
28 | pip install -r https://raw.githubusercontent.com/yaroslavvb/ncluster/master/requirements.txt
29 | pip install -U ncluster # `pip install -e .` to install from a local clone
30 | export AWS_ACCESS_KEY_ID=AKIAIBATdf343
31 | export AWS_SECRET_ACCESS_KEY=z7yKEP/RhO3Olk343aiP
32 | export AWS_DEFAULT_REGION=us-east-1
33 | ```
34 | 
35 | 
36 | 
37 | ## Command-line tools
38 | 
39 | ```
40 | ncluster
41 | ncluster ls
42 | 
43 | # bring up machine t2.nano with default AMI
44 | ncluster launch --name testtest --instance_type t2.nano
45 | 
46 | # kill the machine
47 | ncluster kill testtest
48 | 
49 | # list machines
50 | ncluster ls
51 | ncluster ls
52 | 
53 | 
54 | ncluster ssh   # connects to latest instance
55 | ncluster ssh {substring}   # connects to latest instance containing {substring}
56 | ncluster ssh \'\'
57 | ncluster mosh
58 | ncluster kill {substring}   # terminates matching instances
59 | ncluster kill \'\'
60 | ncluster stop {substring}   # stops matching instances
61 | ncluster start {substring}   # starts matching stopped instances
62 | ncluster nano # starts a tiny instance
63 | ncluster keys # information on enabling SSH access for your team-members
64 | 
65 | ncluster ssh_ # like ssh but works on dumb terminals
66 | ncluster ls
67 | ncluster cat
68 | ncluster cmd "some command to run remotely on AWS"
69 | 
70 | ncluster efs # gives EFS info such as the mount command
71 | 
72 | nsync -m gpubox
73 | nsync -m gpubox -d transformer-xl
74 | 
75 | nsync -d {target directory} -m {machine name substring}
76 | 
77 | nsync -m gpubox # syncs . to ~ on gpubox
78 | nsync -d transformer-xl -m 4gpubox # syncs . to ~/transformer-xl on 4gpubox
79 | 
80 | ncluster hosts
81 | 
82 | 
83 | {substring} selects the most recently launched instances whose name contains the substring. Empty string is a valid substring. Skipping -d will sync to ~ on the remote machine.
Sync seems to be 1 way (from local -> remote) 84 | ``` 85 | 86 | ## Docs 87 | - Some out-of-date docs with more info [docs](https://docs.google.com/document/d/178ITRCAkboHoOEZFnz9XvOsc8lXik6Acz_DS_V1u8hY/edit?usp=sharing) 88 | 89 | ### Extra 90 | An example of installing pip/tmux/python 3.6 on MacOS 91 | 92 | 1. Download Anaconda distribution following https://conda.io/docs/user-guide/install/index.html 93 | 2. Install tmux through homebrew: https://brew.sh/, then `brew install tmux` 94 | 95 | Then 96 | 97 | ``` 98 | conda create -n new python=3.6 -y 99 | conda activate new 100 | ``` 101 | 102 | Extra Deps: 103 | ``` 104 | brew install fswatch 105 | ``` 106 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | Benchmarks 2 | 3 | ``` 4 | pip install -r https://raw.githubusercontent.com/diux-dev/ncluster/master/requirements.txt 5 | pip install ncluster 6 | python 7 | ``` 8 | 9 | 10 | # Debugging 11 | ``` 12 | export NCLUSTER_INSTANCE=c5.18xlarge 13 | export NCLUSTER_RUN_WITH_OUTPUT_ON_FAILURE=1 14 | ``` 15 | -------------------------------------------------------------------------------- /benchmarks/mpi_two_machines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Running locally 5 | 6 | 004/11 sent 100 MBs in 28.4 ms: 3519.33 MB/second 7 | 005/11 sent 100 MBs in 25.1 ms: 3988.50 MB/second 8 | 006/11 sent 100 MBs in 25.5 ms: 3918.33 MB/second 9 | 007/11 sent 100 MBs in 25.3 ms: 3958.61 MB/second 10 | 008/11 sent 100 MBs in 25.3 ms: 3954.15 MB/second 11 | 009/11 sent 100 MBs in 24.9 ms: 4009.78 MB/second 12 | 010/11 sent 100 MBs in 25.0 ms: 3992.75 MB/second 13 | min: 24.94, median: 25.52, mean: 29.53 14 | 15 | 16 | """ 17 | 18 | import argparse 19 | import json 20 | import os 21 | import numpy as np 22 | import tensorflow as tf 23 | import time 24 | 25 | import util 26 | 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--iters", default=11, type=int, 29 | help="Maximum number of additions") 30 | parser.add_argument("--size-mb", default=100, type=int, 31 | help="size of vector in MBs") 32 | parser.add_argument("--shards", default=1, type=int, 33 | help="how many ways to shard the variable") 34 | parser.add_argument('--image', 35 | default='Deep Learning AMI (Ubuntu) Version 22.0') 36 | parser.add_argument('--instance_type', type=str, default='') 37 | parser.add_argument('--name', 38 | default='mpi') 39 | 40 | # internal flags 41 | parser.add_argument('--role', default='launcher', type=str) 42 | args = parser.parse_args() 43 | 44 | 45 | def run_launcher(): 46 | import ncluster 47 | 48 | job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image, 49 | instance_type=args.instance_type) 50 | job.upload(__file__) 51 | job.upload('util.py') 52 | 53 | # kill python just for when tmux session reuse is on 54 | if not ncluster.running_locally(): 55 | job._run_raw('killall python', ignore_errors=True) 56 | 57 | if ncluster.get_backend() == 'aws': 58 | # on AWS probably running in conda DLAMI, switch into TF-enabled env 59 | # TODO(y) switch to PyTorch enabled 60 | job.run('source activate tensorflow_p36') 61 | 62 | 63 | 64 | # TODO(y): this should be private ip 65 | hosts = [task.ip for task in job.tasks] 66 | host_str = ','.join(hosts) 67 | os.system(f'/usr/local/mpi/bin/mpirun -np 2 --host {host_str} python {__file__} --role=worker') 68 | 
print(job.tasks[0].read('/tmp/out')) 69 | 70 | 71 | def run_worker(): 72 | param_size = 250 * 1000 * args.size_mb // args.shards # 1MB is 250k integers 73 | 74 | from mpi4py import MPI 75 | comm = MPI.COMM_WORLD 76 | rank = comm.Get_rank() 77 | 78 | if rank == 0: 79 | log = util.FileLogger('/tmp/out') 80 | # log = util.FileLogger('/dev/null', mirror=False) 81 | 82 | else: 83 | log = util.FileLogger('/dev/null', mirror=False) 84 | grads_array = [] 85 | 86 | time_list = [] 87 | dim = args.size_mb*250*1000 88 | dtype = np.float32 89 | data = np.ones(dim, dtype=dtype)*(rank+1) 90 | for i in range(args.iters): 91 | start_time = time.perf_counter() 92 | if rank == 0: 93 | comm.Send(data, dest=1, tag=13) 94 | else: 95 | data = np.empty(dim, dtype=dtype) 96 | comm.Recv(data, source=0, tag=13) 97 | 98 | end_time = time.perf_counter() 99 | 100 | elapsed_time_ms = (end_time - start_time) * 1000 101 | time_list.append(elapsed_time_ms) 102 | rate = args.size_mb / (elapsed_time_ms / 1000) 103 | log(f'{rank} {i:03d}/{args.iters:d} sent {args.size_mb:d} MBs in {elapsed_time_ms:.1f}' 104 | f' ms: {rate:.2f} MB/second') 105 | 106 | min = np.min(time_list) 107 | median = np.median(time_list) 108 | 109 | log(f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 110 | 111 | 112 | def main(): 113 | # run local benchmark in launcher and launch service 114 | if args.role == "launcher": 115 | run_launcher() 116 | elif args.role == "worker": 117 | run_worker() 118 | else: 119 | assert False, 'unknown role' 120 | 121 | 122 | if __name__ == '__main__': 123 | main() 124 | -------------------------------------------------------------------------------- /benchmarks/pytorch_two_machines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Run locally: 4 | # ./pytorch_p2p.py 5 | # 000/10 added 100 MBs in 35.0 ms: 2854.88 MB/second 6 | # 001/10 added 100 MBs in 25.1 ms: 3979.37 MB/second 7 | # 002/10 added 100 MBs in 25.4 ms: 3935.73 MB/second 8 | # 003/10 added 100 MBs in 24.7 ms: 4040.93 MB/second 9 | # 004/10 added 100 MBs in 24.4 ms: 4097.57 MB/second 10 | # min: 21.58, median: 24.97, mean: 25.61 11 | 12 | # To run on AWS: 13 | # export NCLUSTER_IMAGE='Deep Learning AMI (Ubuntu) Version 15.0' 14 | # export NCLUSTER_INSTANCE=c5.18xlarge 15 | # python pytorch_p2p.py --aws 16 | # 990/1000 added 100 MBs in 83.7 ms: 1194.35 MB/second 17 | # 991/1000 added 100 MBs in 83.4 ms: 1198.78 MB/second 18 | # 992/1000 added 100 MBs in 83.4 ms: 1198.73 MB/second 19 | # 993/1000 added 100 MBs in 83.3 ms: 1201.20 MB/second 20 | # 994/1000 added 100 MBs in 83.1 ms: 1203.84 MB/second 21 | # 995/1000 added 100 MBs in 83.1 ms: 1203.04 MB/second 22 | # 996/1000 added 100 MBs in 83.5 ms: 1197.38 MB/second 23 | # 997/1000 added 100 MBs in 82.4 ms: 1213.99 MB/second 24 | # 998/1000 added 100 MBs in 84.2 ms: 1187.69 MB/second 25 | # 999/1000 added 100 MBs in 83.0 ms: 1204.13 MB/second 26 | # min: 80.52, median: 83.25, mean: 83.29 27 | 28 | import os 29 | import sys 30 | import time 31 | import argparse 32 | import util 33 | 34 | parser = argparse.ArgumentParser(description='launch') 35 | 36 | # launcher flags 37 | parser.add_argument('--name', type=str, default='pytorch_two_machines', 38 | help="name of the current run") 39 | parser.add_argument('--size-mb', type=int, default=100, 40 | help='size of data to send') 41 | parser.add_argument('--iters', type=int, default=10, 42 | help='how many iterations') 43 | parser.add_argument("--aws", 
action="store_true", help="enable to run on AWS") 44 | parser.add_argument('--image', 45 | default='Deep Learning AMI (Ubuntu) Version 15.0') 46 | 47 | 48 | # mpi flags 49 | parser.add_argument('--role', type=str, default='launcher', 50 | help='internal flag, launcher or worker') 51 | parser.add_argument('--rank', type=int, default=0, 52 | help='mpi rank') 53 | parser.add_argument('--size', type=int, default=0, 54 | help='size of mpi world') 55 | parser.add_argument('--master-addr', type=str, default='127.0.0.1', 56 | help='address of master node') 57 | parser.add_argument('--master-port', type=int, default=6006, 58 | help='port of master node') 59 | args = parser.parse_args() 60 | 61 | def worker(): 62 | """ Initialize the distributed environment. """ 63 | 64 | import torch 65 | import torch.distributed as dist 66 | from torch.multiprocessing import Process 67 | import numpy as np 68 | 69 | print("Initializing distributed pytorch") 70 | os.environ['MASTER_ADDR'] = str(args.master_addr) 71 | os.environ['MASTER_PORT'] = str(args.master_port) 72 | # Use TCP backend. Gloo needs nightly, where it currently fails with 73 | # dist.init_process_group('gloo', rank=args.rank, 74 | # AttributeError: module 'torch.distributed' has no attribute 'init_process_group' 75 | dist.init_process_group('tcp', rank=args.rank, 76 | world_size=args.size) 77 | 78 | tensor = torch.ones(args.size_mb*250*1000)*(args.rank+1) 79 | time_list = [] 80 | outfile = 'out' if args.rank == 0 else '/dev/null' 81 | log = util.FileLogger(outfile) 82 | for i in range(args.iters): 83 | # print('before: rank ', args.rank, ' has data ', tensor[0]) 84 | 85 | start_time = time.perf_counter() 86 | if args.rank == 0: 87 | dist.send(tensor=tensor, dst=1) 88 | else: 89 | dist.recv(tensor=tensor, src=0) 90 | 91 | elapsed_time_ms = (time.perf_counter() - start_time)*1000 92 | time_list.append(elapsed_time_ms) 93 | # print('after: rank ', args.rank, ' has data ', tensor[0]) 94 | rate = args.size_mb/(elapsed_time_ms/1000) 95 | 96 | log('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb, elapsed_time_ms, rate)) 97 | 98 | min = np.min(time_list) 99 | median = np.median(time_list) 100 | log(f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 101 | 102 | 103 | def launcher(): 104 | import ncluster 105 | 106 | if args.aws: 107 | ncluster.set_backend('aws') 108 | 109 | job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image) 110 | job.upload(__file__) 111 | job.upload('util.py') 112 | 113 | if args.aws: 114 | job.run('source activate pytorch_p36') 115 | else: 116 | job.run('source deactivate') 117 | job.run('source activate ncluster-test3') 118 | 119 | script_name = os.path.basename(__file__) 120 | common_args = f'--size=2 --master-addr={job.tasks[0].ip} --iters={args.iters} --size-mb={args.size_mb}' 121 | job.tasks[0].run(f'python {script_name} --role=worker --rank=0 '+common_args, 122 | non_blocking=True) 123 | job.tasks[1].run(f'python {script_name} --role=worker --rank=1 '+common_args, 124 | non_blocking=True) 125 | 126 | job.tasks[0].join() 127 | print(job.tasks[0].read('out')) 128 | 129 | 130 | def main(): 131 | if args.role == "launcher": 132 | launcher() 133 | elif args.role == "worker": 134 | worker() 135 | else: 136 | assert False, "Unknown role "+FLAGS.role 137 | 138 | 139 | if __name__ == "__main__": 140 | main() 141 | -------------------------------------------------------------------------------- /benchmarks/ray_ps.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Ray parameter server benchmark 4 | # 5 | # python ray_ps.py --aws --num-ps=1 --num-workers=1 --size-mb=100 --iters=100 6 | 7 | # # 1 worker, 1 ps 8 | # min: 61.61, median: 63.77, mean: 69.20 9 | 10 | # # 1 worker, 2 ps 11 | # python ray_ps.py --aws --num-ps=2 --num-workers=1 --size-mb=100 --iters=100 12 | # min: 49.45, median: 50.91, mean: 58.92 13 | 14 | # # 1 worker, 4 ps 15 | # python ray_ps.py --aws --num-ps=4 --num-workers=1 --size-mb=100 --iters=100 16 | # min: 47.98, median: 50.71, mean: 59.05 17 | 18 | # # 4 worker, 4 ps 19 | # python ray_ps.py --aws --num-ps=4 --num-workers=4 --size-mb=100 --iters=100 20 | # 098/100 sent 400 MBs in 238.5 ms: 419.28 MB/second 21 | # 099/100 sent 400 MBs in 242.0 ms: 413.22 MB/second 22 | # min: 219.90, median: 241.51, mean: 245.95 23 | # (54ms per worker since 4x more work done) 24 | 25 | # # 1 worker, 4 ps, larger arrays 26 | # python ray_ps.py --aws --num-ps=4 --num-workers=1 --size-mb=800 --iters=100 27 | # min: 358.35, median: 544.59, mean: 513.47 28 | # 29 | # Bottom line, 50-60ms to send 100MB regardless of sharding/workers 30 | 31 | import argparse 32 | import os 33 | import socket 34 | import subprocess 35 | import time 36 | 37 | import numpy as np 38 | import ray 39 | 40 | import util 41 | 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("--role", default='launcher', type=str, 44 | help="launcher/driver") 45 | parser.add_argument('--image', 46 | default='Deep Learning AMI (Ubuntu) Version 15.0') 47 | parser.add_argument("--size-mb", default=10, type=int, 48 | help='how much data to send at each iteration') 49 | parser.add_argument("--num-workers", default=2, type=int) 50 | parser.add_argument("--num-ps", default=2, type=int) 51 | 52 | parser.add_argument("--iters", default=11, type=int) 53 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 54 | parser.add_argument("--xray", default=1, type=int, 55 | help="whether to use XRay backend") 56 | parser.add_argument('--nightly', default=1, type=int, 57 | help='whether to use nightly version') 58 | parser.add_argument('--name', default='ray_ps', type=str, 59 | help='name of the run') 60 | parser.add_argument("--ip", default='', type=str, 61 | help="internal flag, used to point worker to head node") 62 | args = parser.parse_args() 63 | 64 | dim = args.size_mb * 250 * 1000 // args.num_ps 65 | 66 | 67 | @ray.remote(resources={"worker": 1}) 68 | class Worker(object): 69 | def __init__(self): 70 | self.gradients = np.ones(dim, dtype=np.float32) 71 | 72 | @ray.method(num_return_vals=args.num_ps) 73 | def compute_gradients(self): 74 | if args.num_ps == 1: 75 | return self.gradients 76 | return [self.gradients]*args.num_ps 77 | 78 | def ip(self): 79 | return ray.services.get_node_ip_address() 80 | 81 | 82 | @ray.remote(resources={"worker": 1}) 83 | class ParameterServer(object): 84 | def __init__(self): 85 | self.params = np.zeros(dim, dtype=np.float32) 86 | 87 | def receive(self, *grad_list): 88 | for grad in grad_list: 89 | self.params = grad # use = just to get network overhead 90 | return self.params 91 | 92 | def get_weights(self): 93 | return self.params 94 | 95 | def ip(self): 96 | return ray.services.get_node_ip_address() 97 | 98 | 99 | 100 | def run_launcher(): 101 | import ncluster 102 | 103 | if args.aws: 104 | ncluster.set_backend('aws') 105 | 106 | if args.nightly: 107 | # running locally MacOS 108 | if 'Darwin' in 
util.ossystem('uname') and not args.aws: 109 | install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl' 110 | else: 111 | install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl' 112 | else: 113 | install_script = 'pip install ray' 114 | 115 | job = ncluster.make_job(name=args.name, 116 | install_script=install_script, 117 | image_name=args.image, 118 | num_tasks=args.num_workers+args.num_ps) 119 | if not ncluster.running_locally(): 120 | job._run_raw('killall python', ignore_errors=True) 121 | 122 | job.upload(__file__) 123 | job.upload('util.py') 124 | if args.xray: 125 | job.run('export RAY_USE_XRAY=1') 126 | job.run('ray stop') 127 | 128 | head = job.tasks[0] 129 | 130 | # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources 131 | worker_resource = """--resources='{"worker": 1}'""" 132 | head.run(f"ray start --head {worker_resource} --redis-port=6379") 133 | 134 | for task in job.tasks[1:]: 135 | task.run(f"ray start --redis-address={head.ip}:6379 {worker_resource}") 136 | 137 | head.run(f'python {__file__} --role=driver --ip={head.ip}:6379 --size-mb={args.size_mb} --iters={args.iters} --num-workers={args.num_workers} --num-ps={args.num_ps}') 138 | 139 | print(head.read('out')) 140 | 141 | 142 | def transpose(list_of_lists): 143 | return list(map(list, zip(*list_of_lists))) 144 | 145 | 146 | def run_driver(): 147 | ray.init(redis_address=args.ip) 148 | 149 | worker_actors = [Worker.remote() for _ in range(args.num_workers)] 150 | ps_actors = [ParameterServer.remote() for _ in range(args.num_ps)] 151 | 152 | log = util.FileLogger('out') 153 | 154 | time_list = [] 155 | for i in range(args.iters): 156 | start_time = time.perf_counter() 157 | grads_list = [] 158 | for actor in worker_actors: 159 | result = actor.compute_gradients.remote() 160 | if args.num_ps == 1: 161 | grads_list.append([result]) 162 | else: 163 | grads_list.append(result) 164 | 165 | updates = [] 166 | for ps, shards in zip(ps_actors, transpose(grads_list)): 167 | updates.append(ps.receive.remote(*shards)) 168 | 169 | ray.wait(updates, num_returns=args.num_ps) 170 | 171 | elapsed_time_ms = (time.perf_counter() - start_time)*1000 172 | time_list.append(elapsed_time_ms) 173 | rate = args.size_mb / (elapsed_time_ms/1000) 174 | log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb*args.num_workers, elapsed_time_ms, rate)) 175 | 176 | min = np.min(time_list) 177 | median = np.median(time_list) 178 | log(f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 179 | 180 | 181 | def main(): 182 | if args.role == 'launcher': 183 | run_launcher() 184 | elif args.role == 'driver': 185 | run_driver() 186 | else: 187 | assert False, f"Unknown role {args.role}, must be laucher/driver" 188 | 189 | 190 | if __name__ == '__main__': 191 | main() 192 | -------------------------------------------------------------------------------- /benchmarks/ray_two_machines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Example of two process Ray program, worker sends values to parameter 4 | # server on a different machine 5 | # 6 | # Run locally: 7 | # ./ray_two_machines.py 8 | # 9 | # Run on AWS: 10 | # ./ray_two_machines.py --aws 11 | 12 | 13 | # Example timings 14 | # c5.18xlarge over network: over network: 63.0 ms: 1586.76 MB/second 15 | # c5.9xlarge over 
network: 399/400 added 100 MBs in 85.5 ms: 1170.26 MB/second 16 | # c5.18xlarge locally: 86 ms, 1218 MB/seconds (9.7 Gbps) 17 | # macbook pro locally: 978.9 ms, 102.15 MB/second 18 | 19 | # c5.18xlarge 20 | # 004/11 sent 100 MBs in 69.4 ms: 1440.31 MB/second 21 | # 005/11 sent 100 MBs in 68.1 ms: 1468.95 MB/second 22 | # 006/11 sent 100 MBs in 70.4 ms: 1421.40 MB/second 23 | # 007/11 sent 100 MBs in 69.5 ms: 1438.62 MB/second 24 | # 008/11 sent 100 MBs in 66.4 ms: 1506.90 MB/second 25 | # 009/11 sent 100 MBs in 76.5 ms: 1306.92 MB/second 26 | # 010/11 sent 100 MBs in 66.8 ms: 1497.64 MB/second 27 | # min: 66.36, median: 69.43, mean: 70.55 28 | 29 | # Another run 30 | # 989/1000 sent 100 MBs in 54.6 ms: 1831.07 MB/second 31 | # 990/1000 sent 100 MBs in 54.4 ms: 1837.20 MB/second 32 | # 991/1000 sent 100 MBs in 54.8 ms: 1824.91 MB/second 33 | # 992/1000 sent 100 MBs in 53.4 ms: 1874.39 MB/second 34 | # 993/1000 sent 100 MBs in 53.1 ms: 1881.77 MB/second 35 | # 994/1000 sent 100 MBs in 52.7 ms: 1897.76 MB/second 36 | # 995/1000 sent 100 MBs in 55.4 ms: 1805.42 MB/second 37 | # 996/1000 sent 100 MBs in 53.4 ms: 1872.93 MB/second 38 | # 997/1000 sent 100 MBs in 52.7 ms: 1896.65 MB/second 39 | # 998/1000 sent 100 MBs in 54.0 ms: 1851.14 MB/second 40 | # 999/1000 sent 100 MBs in 53.6 ms: 1864.93 MB/second 41 | # min: 51.11, median: 55.45, mean: 60.74 42 | 43 | 44 | # Bottom line: 30ms locally, 60ms over network 45 | 46 | import argparse 47 | import os 48 | import socket 49 | import subprocess 50 | import time 51 | 52 | import numpy as np 53 | import ray 54 | 55 | import util 56 | 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument("--role", default='launcher', type=str, 59 | help="launcher/driver") 60 | parser.add_argument('--image', 61 | default='Deep Learning AMI (Ubuntu) Version 15.0') 62 | parser.add_argument("--size-mb", default=100, type=int, 63 | help='how much data to send at each iteration') 64 | parser.add_argument("--iters", default=11, type=int) 65 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 66 | parser.add_argument("--xray", default=1, type=int, 67 | help="whether to use XRay backend") 68 | parser.add_argument('--nightly', default=1, type=int, 69 | help='whether to use nightly version') 70 | parser.add_argument('--name', default='ray_two_machines', type=str, 71 | help='name of the run') 72 | parser.add_argument("--ip", default='', type=str, 73 | help="internal flag, used to point worker to head node") 74 | args = parser.parse_args() 75 | 76 | dim = args.size_mb * 250 * 1000 77 | 78 | 79 | @ray.remote(resources={"worker": 1}) 80 | class Worker(object): 81 | def __init__(self): 82 | self.gradients = np.ones(dim, dtype=np.float32) 83 | 84 | def compute_gradients(self): 85 | return self.gradients 86 | 87 | def ip(self): 88 | return ray.services.get_node_ip_address() 89 | 90 | 91 | @ray.remote(resources={"ps": 1}) 92 | class ParameterServer(object): 93 | def __init__(self): 94 | self.params = np.zeros(dim, dtype=np.float32) 95 | 96 | def receive(self, grad): 97 | self.params = grad # use = just to get network overhead 98 | return self.params 99 | 100 | def get_weights(self): 101 | return self.params 102 | 103 | def ip(self): 104 | return ray.services.get_node_ip_address() 105 | 106 | 107 | 108 | def run_launcher(): 109 | import ncluster 110 | 111 | if args.aws: 112 | ncluster.set_backend('aws') 113 | 114 | if args.nightly: 115 | # running locally MacOS 116 | print(f"asdfasdf {util.ossystem('uname')}") 117 | if 'Darwin' in 
util.ossystem('uname') and not args.aws: 118 | install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl' 119 | print(f"asdfasdf got install script {install_script}") 120 | else: 121 | install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl' 122 | else: 123 | install_script = 'pip install ray' 124 | 125 | job = ncluster.make_job(name=args.name, 126 | install_script=install_script, 127 | image_name=args.image, 128 | num_tasks=2) 129 | ps, worker = job.tasks 130 | if not ncluster.running_locally(): 131 | ps._run_raw('killall python', ignore_errors=True) 132 | worker._run_raw('killall python', ignore_errors=True) 133 | 134 | job.upload(__file__) 135 | job.upload('util.py') 136 | if args.xray: 137 | job.run('export RAY_USE_XRAY=1') 138 | job.run('ray stop') 139 | 140 | # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources 141 | ps_resource = """--resources='{"ps": 1}'""" 142 | worker_resource = """--resources='{"worker": 1}'""" 143 | 144 | ps.run(f"ray start --head {ps_resource} --redis-port=6379") 145 | worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}") 146 | worker.run( 147 | f'./{__file__} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}') 148 | print(worker.read('out')) 149 | 150 | 151 | def run_driver(): 152 | ray.init(redis_address=args.ip) 153 | 154 | worker = Worker.remote() 155 | ps = ParameterServer.remote() 156 | log = util.FileLogger('out') 157 | log(f"Worker ip {ray.get(worker.ip.remote())}") 158 | log(f"PS ip {ray.get(ps.ip.remote())}") 159 | log(f"Driver ip {socket.gethostbyname(socket.gethostname())}") 160 | 161 | time_list = [] 162 | for i in range(args.iters): 163 | start_time = time.perf_counter() 164 | grads = worker.compute_gradients.remote() 165 | result = ps.receive.remote(grads) 166 | ray.wait([result]) 167 | elapsed_time_ms = (time.perf_counter() - start_time)*1000 168 | time_list.append(elapsed_time_ms) 169 | rate = args.size_mb / (elapsed_time_ms/1000) 170 | log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb, elapsed_time_ms, rate)) 171 | 172 | min = np.min(time_list) 173 | median = np.median(time_list) 174 | log(f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 175 | 176 | 177 | def main(): 178 | if args.role == 'launcher': 179 | run_launcher() 180 | elif args.role == 'driver': 181 | run_driver() 182 | else: 183 | assert False, f"Unknown role {args.role}, must be laucher/driver" 184 | 185 | 186 | if __name__ == '__main__': 187 | main() 188 | -------------------------------------------------------------------------------- /benchmarks/ray_two_machines_local.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Runs two machine benchmark locally on AWS machine 4 | # 5 | # Example timings 6 | # macbook: added 10 MBs in 14.1 ms: 707.68 MB/second 7 | # c5.18xlarge: added 10 MBs in 4.4 ms: 2298.82 MB/second 8 | # 091/100 added 100 MBs in 30.8 ms: 3246.44 MB/second 9 | 10 | # Bottom line: can do 3.2 GB/second running locally, 800 11 | import argparse 12 | import os 13 | import socket 14 | import subprocess 15 | import time 16 | 17 | import numpy as np 18 | import ray 19 | 20 | import util 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--image', 24 | default='Deep Learning AMI (Ubuntu) Version 15.0') 25 | 
parser.add_argument("--size-mb", default=100, type=int, 26 | help='how much data to send at each iteration') 27 | parser.add_argument("--iters", default=11, type=int) 28 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 29 | parser.add_argument("--xray", default=1, type=int, 30 | help="whether to use XRay backend") 31 | parser.add_argument('--nightly', default=1, type=int, 32 | help='whether to use nightly version') 33 | parser.add_argument('--name', default='ray_two_machines', type=str, 34 | help='name of the run') 35 | 36 | parser.add_argument("--ip", default='', type=str, 37 | help="internal flag, used to point worker to head node") 38 | parser.add_argument("--role", default='launcher', type=str, 39 | help="interanl flag, launcher/driver") 40 | args = parser.parse_args() 41 | 42 | dim = args.size_mb * 250 * 1000 43 | 44 | 45 | @ray.remote(resources={"worker": 1}) 46 | class Worker(object): 47 | def __init__(self): 48 | self.gradients = np.ones(dim, dtype=np.float32) 49 | 50 | def compute_gradients(self): 51 | return self.gradients 52 | 53 | def ip(self): 54 | return ray.services.get_node_ip_address() 55 | 56 | 57 | @ray.remote(resources={"ps": 1}) 58 | class ParameterServer(object): 59 | def __init__(self): 60 | self.params = np.zeros(dim, dtype=np.float32) 61 | 62 | def assign_add(self, grad): 63 | self.params = grad # use = just to get network overhead 64 | return self.params 65 | 66 | def get_weights(self): 67 | return self.params 68 | 69 | def ip(self): 70 | return ray.services.get_node_ip_address() 71 | 72 | 73 | 74 | def run_launcher(): 75 | import ncluster 76 | 77 | if args.aws: 78 | ncluster.set_backend('aws') 79 | 80 | if args.nightly: 81 | # running locally MacOS 82 | if 'Darwin' in util.ossystem('uname') and not args.aws: 83 | install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl' 84 | else: 85 | install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl' 86 | else: 87 | install_script = 'pip install ray' 88 | 89 | worker = ncluster.make_task(name=args.name, 90 | install_script=install_script, 91 | image_name=args.image) 92 | if not ncluster.running_locally(): 93 | worker._run_raw('killall python', ignore_errors=True) 94 | worker.upload(__file__) 95 | worker.upload('util.py') 96 | if args.xray: 97 | worker.run('export RAY_USE_XRAY=1') 98 | worker.run('ray stop') 99 | 100 | resources = """--resources='{"ps": 1, "worker": 1}'""" 101 | worker.run(f"ray start --head {resources} --redis-port=6379") 102 | # worker.run(f"ray start --redis-address={worker.ip}:6379 {resources}") 103 | worker.run( 104 | f'./{__file__} --role=driver --ip={worker.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}') 105 | print(worker.read('out')) 106 | 107 | 108 | def run_driver(): 109 | ray.init(redis_address=args.ip) 110 | 111 | worker = Worker.remote() 112 | ps = ParameterServer.remote() 113 | log = util.FileLogger('out') 114 | log(f"Worker ip {ray.get(worker.ip.remote())}") 115 | log(f"Driver ip {socket.gethostbyname(socket.gethostname())}") 116 | 117 | time_list = [] 118 | for i in range(args.iters): 119 | start_time = time.perf_counter() 120 | grads = worker.compute_gradients.remote() 121 | result = ps.assign_add.remote(grads) 122 | result = ray.get(result)[0] 123 | elapsed_time_ms = (time.perf_counter() - start_time)*1000 124 | time_list.append(elapsed_time_ms) 125 | rate = args.size_mb / (elapsed_time_ms/1000) 126 | 
log('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb, elapsed_time_ms, rate)) 127 | 128 | min = np.min(time_list) 129 | median = np.median(time_list) 130 | log(f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 131 | 132 | 133 | def main(): 134 | if args.role == 'launcher': 135 | run_launcher() 136 | elif args.role == 'driver': 137 | run_driver() 138 | else: 139 | assert False, f"Unknown role {args.role}, must be laucher/driver" 140 | 141 | 142 | if __name__ == '__main__': 143 | main() 144 | -------------------------------------------------------------------------------- /benchmarks/requirements.txt: -------------------------------------------------------------------------------- 1 | ray 2 | torch 3 | tensorflow 4 | -------------------------------------------------------------------------------- /benchmarks/summary.txt: -------------------------------------------------------------------------------- 1 | tf_two_machines -- 500 on t3, 910 on c3 2 | 3 | Ray can do 4 | 30ms on local transfers, 60ms on AWS c5.18xlarge 5 | Using multiple ps shards, can do 48ms on AWS 6 | 7 | 8 | 40ms on unoptimized PyTorch clone 9 | 2.7ms for optimized memcpy on skylake: 300 Gbps (37 GB/second, close to memory bandwidth) -- https://www.google.com/url?q=https://www.anandtech.com/show/11544/intel-skylake-ep-vs-amd-epyc-7000-cpu-battle-of-the-decade/12&source=gmail&ust=1537921524487000&usg=AFQjCNGUrAScjR_rAihauUr-nj5TMg-VKQ 10 | 11 | 12 | PyTorch backend can do 20 Gbps per thread on 13 | -------------------------------------------------------------------------------- /benchmarks/tf_two_machines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | """ 5 | TensorFlow distributed benchmark. Create sender/receiver tasks and add arrays from sender tasks to variable on receiver. 
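(How it works: the sender pins "grads" variables to /job:chief and the receiver pins matching "params" variables to /job:receiver; a grouped assign op copies the data across the network on each step, and the wall-clock time of each sess.run gives the MB/second figures quoted below. The timings shown are sample runs, not guarantees.)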
6 | 7 | To run locally: 8 | ./tf_two_machines.py 9 | Should see something like this 10 | 11 | ``` 12 | 005/11 added 100 MBs in 78.9 ms: 1266.98 MB/second 13 | 006/11 added 100 MBs in 78.1 ms: 1280.07 MB/second 14 | 007/11 added 100 MBs in 78.1 ms: 1280.56 MB/second 15 | 008/11 added 100 MBs in 81.8 ms: 1222.76 MB/second 16 | 009/11 added 100 MBs in 79.5 ms: 1258.54 MB/second 17 | 010/11 added 100 MBs in 76.6 ms: 1305.64 MB/second 18 | min: 76.59, median: 78.80, mean: 88.34 19 | ``` 20 | 21 | To interact with task 1 (the driver), do "tmux a -t 1" 22 | 23 | To run on AWS 24 | aws configure # or set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY/AWS_DEFAULT_REGION 25 | ./tf_two_machines.py --aws 26 | 27 | Should see something like this with t3.large instances 28 | ``` 29 | 089/100 added 100 MBs in 253.8 ms: 504.27 MB/second 30 | 090/100 added 100 MBs in 252.6 ms: 506.63 MB/second 31 | 091/100 added 100 MBs in 255.0 ms: 501.92 MB/second 32 | ``` 33 | 34 | Running c5.18xlarge machines with more iterations 35 | 007/11 sent 100 MBs in 135.4 ms: 738.47 MB/second 36 | 008/11 sent 100 MBs in 133.0 ms: 752.04 MB/second 37 | 009/11 sent 100 MBs in 133.8 ms: 747.48 MB/second 38 | 010/11 sent 100 MBs in 136.3 ms: 733.77 MB/second 39 | min: 132.97, median: 134.98, mean: 137.27 40 | 41 | 42 | Can use more shards 43 | ./tf_two_machines.py --aws --shards=8 --iters=1000 44 | 994/1000 sent 100 MBs in 87.0 ms: 1149.50 MB/second 45 | 995/1000 sent 100 MBs in 87.0 ms: 1149.21 MB/second 46 | 996/1000 sent 100 MBs in 86.8 ms: 1152.11 MB/second 47 | 997/1000 sent 100 MBs in 89.8 ms: 1113.89 MB/second 48 | 998/1000 sent 100 MBs in 87.9 ms: 1137.37 MB/second 49 | 999/1000 sent 100 MBs in 88.0 ms: 1135.80 MB/second 50 | min: 86.12, median: 88.48, mean: 89.51 51 | 52 | 53 | To connect and interact with the job look for SSH instructions like this 54 | To connect to 0.tf_two_machines 55 | ssh -i /Users/yaroslav/.ncluster/ncluster2-yaroslav-316880547378-us-east-1.pem -o StrictHostKeyChecking=no ubuntu@18.234.30.222 56 | 57 | ssh into the instance following these instructions, then run "tmux a" 58 | 59 | 60 | """ 61 | 62 | import argparse 63 | import json 64 | import os 65 | import numpy as np 66 | import tensorflow as tf 67 | import time 68 | 69 | import util 70 | 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 73 | parser.add_argument("--iters", default=11, type=int, 74 | help="Maximum number of additions") 75 | parser.add_argument("--size-mb", default=100, type=int, 76 | help="size of vector in MBs") 77 | parser.add_argument("--shards", default=1, type=int, 78 | help="how many ways to shard the variable") 79 | parser.add_argument('--image', 80 | default='Deep Learning AMI (Ubuntu) Version 15.0') 81 | parser.add_argument('--name', 82 | default='tf_two_machines') 83 | 84 | # internal flags 85 | parser.add_argument('--role', default='launcher', type=str) 86 | parser.add_argument("--sender-ip", default='127.0.0.1') 87 | parser.add_argument("--receiver-ip", default='127.0.0.1') 88 | args = parser.parse_args() 89 | 90 | cluster_spec = {'chief': [args.sender_ip + ':32300'], 91 | 'receiver': [args.receiver_ip + ':32301']} 92 | 93 | 94 | def _launch_server(role): 95 | os.environ['TF_CONFIG'] = json.dumps( 96 | {'cluster': cluster_spec, 97 | 'task': {'type': role, 'index': 0}}) 98 | config = tf.estimator.RunConfig() 99 | return tf.train.Server(config.cluster_spec, 100 | job_name=config.task_type, 101 | task_index=config.task_id) 102 | 103 | 104 | def 
run_launcher(): 105 | import ncluster 106 | if args.aws: 107 | ncluster.set_backend('aws') 108 | 109 | job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image) 110 | job.upload(__file__) 111 | job.upload('util.py') 112 | 113 | sender, receiver = job.tasks 114 | # kill python just for when tmux session reuse is on 115 | if not ncluster.running_locally(): 116 | sender._run_raw('killall python', ignore_errors=True) 117 | receiver._run_raw('killall python', ignore_errors=True) 118 | 119 | if ncluster.get_backend() == 'aws': 120 | # on AWS probably running in conda DLAMI, switch into TF-enabled env 121 | job.run('source activate tensorflow_p36') 122 | 123 | ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}' 124 | receiver.run(f'python {__file__} --role=receiver {ip_config}', 125 | non_blocking=True) 126 | sender.run( 127 | f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}') 128 | print(sender.read('out')) 129 | 130 | 131 | def run_receiver(): 132 | server = _launch_server('receiver') 133 | time.sleep(365 * 24 * 3600) 134 | del server 135 | 136 | 137 | def run_sender(): 138 | param_size = 250 * 1000 * args.size_mb // args.shards # 1MB is 250k integers 139 | log = util.FileLogger('out') 140 | grads_array = [] 141 | with tf.device('/job:chief/task:0'): 142 | # grads = tf.fill([param_size], 1.) 143 | for i in range(args.shards): 144 | grads = tf.Variable(tf.ones([param_size])) 145 | grads_array.append(grads) 146 | 147 | params_array = [] 148 | add_op_array = [] 149 | with tf.device('/job:receiver/task:0'): 150 | for i in range(args.shards): 151 | params = tf.Variable(tf.ones([param_size])) 152 | add_op = params.assign(grads_array[i]).op 153 | params_array.append(params) 154 | add_op_array.append(add_op) 155 | add_op = tf.group(*add_op_array) 156 | 157 | server = _launch_server('chief') 158 | sess = tf.Session(server.target) 159 | sess.run(tf.global_variables_initializer()) 160 | # except Exception as e: 161 | # # sometimes .run fails with .UnavailableError: OS Error 162 | # log(f"initialization failed with {e}, retrying in 1 second") 163 | # time.sleep(1) 164 | 165 | time_list = [] 166 | for i in range(args.iters): 167 | start_time = time.perf_counter() 168 | sess.run(add_op) 169 | elapsed_time_ms = (time.perf_counter() - start_time) * 1000 170 | time_list.append(elapsed_time_ms) 171 | rate = args.size_mb / (elapsed_time_ms / 1000) 172 | log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % ( 173 | i, args.iters, args.size_mb, elapsed_time_ms, rate)) 174 | 175 | min = np.min(time_list) 176 | median = np.median(time_list) 177 | 178 | log( 179 | f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 180 | 181 | 182 | def main(): 183 | # run local benchmark in launcher and launch service 184 | if args.role == "launcher": 185 | run_launcher() 186 | elif args.role == "sender": 187 | run_sender() 188 | elif args.role == "receiver": 189 | run_receiver() 190 | else: 191 | assert False, 'unknown role' 192 | 193 | 194 | if __name__ == '__main__': 195 | main() 196 | -------------------------------------------------------------------------------- /benchmarks/tf_two_machines_local.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Runs distributed benchmark on a single machine remotely 4 | 5 | Adding 100MB buffers 6 | 7 | # 1 shard: 88ms 8 | # 4 shards: 56ms 9 | # 8 shards: 51ms 10 | # 16 shards: 55ms 11 | 12 | # increase 
size 8x 13 | python tf_two_machines_local.py --shards=8 --iters=100 --size-mb=800 --aws 14 | # 416ms 15 | 16 | Bottom line: 1.6-1.9 GB/second when running locally 17 | Going 1->4 shards saves 30%, 4->8 shards another 5% 18 | 19 | i3.metal 30% slower than c5.18xlarge 20 | 21 | """ 22 | 23 | import argparse 24 | import json 25 | import os 26 | import numpy as np 27 | import tensorflow as tf 28 | import time 29 | 30 | import util 31 | 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 34 | parser.add_argument("--iters", default=11, type=int, 35 | help="Maximum number of additions") 36 | parser.add_argument("--size-mb", default=100, type=int, 37 | help="size of vector in MBs") 38 | parser.add_argument("--shards", default=1, type=int, 39 | help="how many ways to shard the variable") 40 | parser.add_argument('--image', 41 | default='Deep Learning AMI (Ubuntu) Version 15.0') 42 | parser.add_argument('--name', 43 | default='tf_two_machines_local') 44 | 45 | # internal flags 46 | parser.add_argument('--role', default='launcher', type=str) 47 | parser.add_argument("--sender-ip", default='127.0.0.1') 48 | parser.add_argument("--receiver-ip", default='127.0.0.1') 49 | args = parser.parse_args() 50 | 51 | cluster_spec = {'chief': [args.sender_ip + ':32300'], 52 | 'receiver': [args.receiver_ip + ':32301']} 53 | 54 | 55 | def _launch_server(role): 56 | os.environ['TF_CONFIG'] = json.dumps( 57 | {'cluster': cluster_spec, 58 | 'task': {'type': role, 'index': 0}}) 59 | config = tf.estimator.RunConfig() 60 | return tf.train.Server(config.cluster_spec, 61 | job_name=config.task_type, 62 | task_index=config.task_id) 63 | 64 | 65 | def run_launcher(): 66 | import ncluster 67 | ncluster.util.assert_script_in_current_directory() 68 | 69 | if args.aws: 70 | ncluster.set_backend('aws') 71 | 72 | # use 4GB instance, 0.5GB not enough 73 | worker = ncluster.make_task(args.name, image_name=args.image, 74 | instance_type='t3.medium') 75 | worker.upload(__file__) 76 | worker.upload('util.py') 77 | 78 | # kill python just for when tmux session reuse is on 79 | if not ncluster.running_locally(): 80 | # on AWS probably running in conda DLAMI, switch into TF-enabled env 81 | worker._run_raw('killall python', ignore_errors=True) 82 | worker.run('source activate tensorflow_p36') 83 | 84 | ip_config = f'--sender-ip={worker.ip} --receiver-ip={worker.ip}' 85 | worker.run(f'python {__file__} --role=receiver {ip_config}', 86 | non_blocking=True) 87 | worker.switch_window(1) # run in new tmux window 88 | if not ncluster.running_locally(): 89 | worker.run('source activate tensorflow_p36') 90 | worker.run( 91 | f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}') 92 | print(worker.read('out')) 93 | 94 | 95 | def run_receiver(): 96 | server = _launch_server('receiver') 97 | time.sleep(365 * 24 * 3600) 98 | del server 99 | 100 | 101 | def run_sender(): 102 | param_size = 250 * 1000 * args.size_mb // args.shards # 1MB is 250k integers 103 | log = util.FileLogger('out') 104 | grads_array = [] 105 | with tf.device('/job:chief/task:0'): 106 | # grads = tf.fill([param_size], 1.) 
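    # Sharding note: each shard below is a separate variable of roughly
    # (size_mb / shards) MB pinned to the sender job, with a matching variable
    # on the receiver job; the per-shard assign ops are grouped into a single
    # step, which presumably lets TensorFlow overlap the transfers (the
    # docstring above reports going 1 -> 4 shards saves ~30%).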
107 | for i in range(args.shards): 108 | grads = tf.Variable(tf.ones([param_size])) 109 | grads_array.append(grads) 110 | 111 | params_array = [] 112 | add_op_array = [] 113 | with tf.device('/job:receiver/task:0'): 114 | for i in range(args.shards): 115 | params = tf.Variable(tf.ones([param_size])) 116 | add_op = params.assign(grads_array[i]).op 117 | params_array.append(params) 118 | add_op_array.append(add_op) 119 | add_op = tf.group(*add_op_array) 120 | 121 | server = _launch_server('chief') 122 | sess = tf.Session(server.target) 123 | sess.run(tf.global_variables_initializer()) 124 | # except Exception as e: 125 | # # sometimes .run fails with .UnavailableError: OS Error 126 | # log(f"initialization failed with {e}, retrying in 1 second") 127 | # time.sleep(1) 128 | 129 | time_list = [] 130 | for i in range(args.iters): 131 | start_time = time.perf_counter() 132 | sess.run(add_op) 133 | elapsed_time_ms = (time.perf_counter() - start_time) * 1000 134 | time_list.append(elapsed_time_ms) 135 | rate = args.size_mb / (elapsed_time_ms / 1000) 136 | log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % ( 137 | i, args.iters, args.size_mb, elapsed_time_ms, rate)) 138 | 139 | min = np.min(time_list) 140 | median = np.median(time_list) 141 | 142 | log( 143 | f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 144 | 145 | 146 | def main(): 147 | # run local benchmark in launcher and launch service 148 | if args.role == "launcher": 149 | run_launcher() 150 | elif args.role == "sender": 151 | run_sender() 152 | elif args.role == "receiver": 153 | run_receiver() 154 | else: 155 | assert False, 'unknown role' 156 | 157 | 158 | if __name__ == '__main__': 159 | main() 160 | -------------------------------------------------------------------------------- /benchmarks/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | 5 | 6 | class FileLogger: 7 | """Helper class to log to file (possibly mirroring to stderr) 8 | logger = FileLogger('somefile.txt') 9 | logger = FileLogger('somefile.txt', mirror=True) 10 | logger('somemessage') 11 | logger('somemessage: %s %.2f', 'value', 2.5) 12 | """ 13 | 14 | def __init__(self, fn, mirror=True): 15 | self.fn = fn 16 | self.f = open(fn, 'w') 17 | self.mirror = mirror 18 | print(f"Creating FileLogger on {os.path.abspath(fn)}") 19 | 20 | def __call__(self, s='', *args): 21 | """Either ('asdf %f', 5) or (val1, val2, val3, ...)""" 22 | if (isinstance(s, str) or isinstance(s, bytes)) and '%' in s: 23 | formatted_s = s % args 24 | else: 25 | toks = [s] + list(args) 26 | formatted_s = ', '.join(str(s) for s in toks) 27 | 28 | self.f.write(formatted_s + '\n') 29 | self.f.flush() 30 | if self.mirror: 31 | # use manual flushing because "|" makes output 4k buffered instead of 32 | # line-buffered 33 | sys.stdout.write(formatted_s+'\n') 34 | sys.stdout.flush() 35 | 36 | def __del__(self): 37 | self.f.close() 38 | 39 | 40 | def ossystem(cmd): 41 | """Like os.system, but returns output of command as string.""" 42 | p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, 43 | stderr=subprocess.STDOUT) 44 | (stdout, stderr) = p.communicate() 45 | return stdout.decode('ascii') if stdout else '' 46 | -------------------------------------------------------------------------------- /examples/deleteme.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | print(sys.argv[0]) 4 | 
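# (Scratch example: just prints the path the script was invoked with,
# presumably used to sanity-check upload/run round-trips.)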
-------------------------------------------------------------------------------- /examples/gpubox.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Launch a single GPU instance with jupyter notebook 4 | 5 | import argparse 6 | import os 7 | import ncluster 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--name', type=str, default='gpubox', 11 | help="instance name") 12 | parser.add_argument('--image-name', type=str, 13 | default='Deep Learning AMI (Ubuntu) Version 23.0', 14 | help="name of AMI to use ") 15 | parser.add_argument('--instance-type', type=str, default='p3.2xlarge', 16 | help="type of instance") 17 | parser.add_argument('--password', 18 | default='DefaultNotebookPasswordPleaseChange', 19 | help='password to use for jupyter notebook') 20 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 21 | 22 | args = parser.parse_args() 23 | module_path = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | def main(): 26 | task = ncluster.make_task(name=args.name, 27 | instance_type=args.instance_type, 28 | image_name=args.image_name) 29 | 30 | # upload notebook config with provided password 31 | jupyter_config_fn = _create_jupyter_config(args.password) 32 | remote_config_fn = '~/.jupyter/jupyter_notebook_config.py' 33 | task.upload(jupyter_config_fn, remote_config_fn) 34 | 35 | # upload sample notebook and start Jupyter server 36 | task.run('mkdir -p /ncluster/notebooks') 37 | task.upload(f'{module_path}/gpubox_sample.ipynb', 38 | '/ncluster/notebooks/gpubox_sample.ipynb', 39 | dont_overwrite=True) 40 | task.run('cd /ncluster/notebooks') 41 | task.run('jupyter notebook', non_blocking=True) 42 | print(f'Jupyter notebook will be at http://{task.public_ip}:8888') 43 | 44 | 45 | def _create_jupyter_config(password): 46 | from notebook.auth import passwd 47 | sha = passwd(args.password) 48 | local_config_fn = f'{module_path}/gpubox_jupyter_notebook_config.py' 49 | temp_config_fn = '/tmp/' + os.path.basename(local_config_fn) 50 | os.system(f'cp {local_config_fn} {temp_config_fn}') 51 | _replace_lines(temp_config_fn, 'c.NotebookApp.password', 52 | f"c.NotebookApp.password = '{sha}'") 53 | return temp_config_fn 54 | 55 | 56 | def _replace_lines(fn, startswith, new_line): 57 | """Replace lines starting with starts_with in fn with new_line.""" 58 | new_lines = [] 59 | for line in open(fn): 60 | if line.startswith(startswith): 61 | new_lines.append(new_line) 62 | else: 63 | new_lines.append(line) 64 | with open(fn, 'w') as f: 65 | f.write('\n'.join(new_lines)) 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /examples/gpubox_sample.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Mon Aug 13 23:41:40 2018 \r\n", 13 | "+-----------------------------------------------------------------------------+\r\n", 14 | "| NVIDIA-SMI 396.37 Driver Version: 396.37 |\r\n", 15 | "|-------------------------------+----------------------+----------------------+\r\n", 16 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", 17 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. 
|\r\n", 18 | "|===============================+======================+======================|\r\n", 19 | "| 0 Tesla M60 On | 00000000:00:1E.0 Off | 0 |\r\n", 20 | "| N/A 43C P8 14W / 150W | 0MiB / 7618MiB | 0% Default |\r\n", 21 | "+-------------------------------+----------------------+----------------------+\r\n", 22 | " \r\n", 23 | "+-----------------------------------------------------------------------------+\r\n", 24 | "| Processes: GPU Memory |\r\n", 25 | "| GPU PID Type Process name Usage |\r\n", 26 | "|=============================================================================|\r\n", 27 | "| No running processes found |\r\n", 28 | "+-----------------------------------------------------------------------------+\r\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "!nvidia-smi" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [] 42 | } 43 | ], 44 | "metadata": { 45 | "kernelspec": { 46 | "display_name": "Python [default]", 47 | "language": "python", 48 | "name": "python3" 49 | }, 50 | "language_info": { 51 | "codemirror_mode": { 52 | "name": "ipython", 53 | "version": 3 54 | }, 55 | "file_extension": ".py", 56 | "mimetype": "text/x-python", 57 | "name": "python", 58 | "nbconvert_exporter": "python", 59 | "pygments_lexer": "ipython3", 60 | "version": "3.6.4" 61 | }, 62 | "toc": { 63 | "colors": { 64 | "hover_highlight": "#DAA520", 65 | "running_highlight": "#FF0000", 66 | "selected_highlight": "#FFD700" 67 | }, 68 | "moveMenuLeft": true, 69 | "nav_menu": { 70 | "height": "12px", 71 | "width": "252px" 72 | }, 73 | "navigate_menu": true, 74 | "number_sections": true, 75 | "sideBar": true, 76 | "threshold": 4, 77 | "toc_cell": false, 78 | "toc_section_display": "block", 79 | "toc_window_display": false 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 2 84 | } 85 | -------------------------------------------------------------------------------- /examples/launch_16_instances.py: -------------------------------------------------------------------------------- 1 | import ncluster 2 | import time 3 | 4 | def main(): 5 | ncluster.set_backend('aws') 6 | 7 | start_time = time.time() 8 | job = ncluster.make_job(num_tasks=16) 9 | print(f"waited for startup for {time.time()-start_time} seconds") 10 | 11 | start_time = time.time() 12 | job.run('sleep 10') 13 | print(f"waited for exec for {time.time()-start_time} seconds") 14 | 15 | if __name__ == '__main__': 16 | main() 17 | -------------------------------------------------------------------------------- /examples/ray_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Example of two process Ray program, worker sends values to parameter 4 | # server on a different machine 5 | # 6 | # Run locally: 7 | # ./ray_example.py 8 | # 9 | # Run on AWS: 10 | # ./ray_example.py --aws 11 | 12 | import argparse 13 | import os 14 | import time 15 | 16 | import numpy as np 17 | import ray 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--role", default='launcher', type=str, 21 | help="launcher/driver") 22 | parser.add_argument('--image', default='Deep Learning AMI (Ubuntu) Version 13.0') 23 | parser.add_argument("--size-mb", default=10, type=int, help='how much data to send at each iteration') 24 | parser.add_argument("--iters", default=10, type=int) 25 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 26 | parser.add_argument("--ip", 
default='', type=str, 27 | help="internal flag, used to point worker to head node") 28 | 29 | args = parser.parse_args() 30 | 31 | dim = args.size_mb * 250 * 1000 32 | 33 | 34 | @ray.remote(resources={"worker": 1}) 35 | class Worker(object): 36 | def __init__(self): 37 | self.gradients = np.ones(dim, dtype=np.float32) 38 | 39 | def compute_gradients(self): 40 | return self.gradients 41 | 42 | 43 | @ray.remote(resources={"ps": 1}) 44 | class ParameterServer(object): 45 | def __init__(self): 46 | self.params = np.zeros(dim, dtype=np.float32) 47 | 48 | def assign_add(self, grad): 49 | self.params += grad 50 | return self.params 51 | 52 | def get_weights(self): 53 | return self.params 54 | 55 | 56 | def run_launcher(): 57 | import ncluster 58 | 59 | if args.aws: 60 | ncluster.set_backend('aws') 61 | 62 | script = os.path.basename(__file__) 63 | assert script in os.listdir('.') 64 | job = ncluster.make_job(install_script='pip install ray', 65 | image_name=args.image, 66 | instance_type='c5.large', 67 | num_tasks=2) 68 | job.upload(script) 69 | job.run('export RAY_USE_XRAY=1') 70 | job.run('ray stop') 71 | 72 | # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources 73 | ps_resource = """--resources='{"ps": 1}'""" 74 | worker_resource = """--resources='{"worker": 1}'""" 75 | ps, worker = job.tasks 76 | ps.run(f"ray start --head {ps_resource} --redis-port=6379") 77 | worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}") 78 | worker.run(f'./{script} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}') 79 | 80 | 81 | def run_driver(): 82 | ray.init(redis_address=args.ip) 83 | 84 | worker = Worker.remote() 85 | ps = ParameterServer.remote() 86 | 87 | for iteration in range(args.iters): 88 | start_time = time.time() 89 | grads = worker.compute_gradients.remote() 90 | result = ps.assign_add.remote(grads) 91 | result = ray.get(result)[0] 92 | elapsed_time = time.time() - start_time 93 | rate = args.size_mb / elapsed_time 94 | print('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (result, args.iters, args.size_mb, elapsed_time * 1000, rate)) 95 | 96 | 97 | def main(): 98 | if args.role == 'launcher': 99 | run_launcher() 100 | elif args.role == 'driver': 101 | run_driver() 102 | else: 103 | assert False, f"Unknown role {args.role}, must be laucher/driver" 104 | 105 | 106 | if __name__ == '__main__': 107 | main() 108 | -------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter # for notebook.auth.passwd 2 | tensorflow 3 | torch 4 | ray 5 | -------------------------------------------------------------------------------- /examples/simple_job.py: -------------------------------------------------------------------------------- 1 | import ncluster 2 | import time 3 | 4 | def main(): 5 | ncluster.set_backend('local') 6 | 7 | job = ncluster.make_job(num_tasks=2) 8 | 9 | start_time = time.time() 10 | job.run('sleep 1') 11 | print(f"waited for {time.time()-start_time} seconds") 12 | 13 | if __name__ == '__main__': 14 | main() 15 | -------------------------------------------------------------------------------- /examples/simple_task.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import ncluster 3 | 4 | # allocate default machine type and default image 5 | task = ncluster.make_task() 6 | output = task.run('ifconfig') 7 | print(f"Task ifconfig 
returned {output}") 8 | -------------------------------------------------------------------------------- /examples/simple_tf.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | import sys 3 | 4 | if not sys.argv[1:]: 5 | import ncluster 6 | task = ncluster.make_task(instance_type='t3.micro') 7 | task.upload(__file__) 8 | task.run('pip install tensorflow') 9 | task.run(f'python {__file__} worker') 10 | elif sys.argv[1] == 'worker': 11 | import tensorflow as tf 12 | import os 13 | sess = tf.Session() 14 | ones = tf.ones((1000,1000)) 15 | result = sess.run(tf.matmul(ones, ones)) 16 | print(f"matmul gave {result.sum()}") 17 | os.system('sudo shutdown -h -P 10') # shut down the instance in 10 mins 18 | 19 | -------------------------------------------------------------------------------- /examples/tf_adder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | """ 5 | TensorFlow distributed benchmark. Create sender/receiver tasks and add arrays from sender tasks to variable on receiver. 6 | 7 | To run locally: 8 | ./tf_adder.py 9 | tmux a -t 0 10 | 11 | Should see something like this 12 | ``` 13 | 089/100 added 100 MBs in 114.9 ms: 1114.36 MB/second 14 | 090/100 added 100 MBs in 113.4 ms: 1128.61 MB/second 15 | 091/100 added 100 MBs in 113.4 ms: 1128.60 MB/second 16 | ``` 17 | 18 | 19 | To run on AWS 20 | aws configure # or set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY/AWS_DEFAULT_REGION 21 | ./tf_adder.py --aws 22 | nconnect 0.tf_adder 23 | 24 | Should see something like this with t3.large instances 25 | ``` 26 | 089/100 added 100 MBs in 253.8 ms: 504.27 MB/second 27 | 090/100 added 100 MBs in 252.6 ms: 506.63 MB/second 28 | 091/100 added 100 MBs in 255.0 ms: 501.92 MB/second 29 | ``` 30 | 31 | """ 32 | 33 | import argparse 34 | import json 35 | import os 36 | import tensorflow as tf 37 | import time 38 | 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("--aws", action="store_true", help="enable to run on AWS") 41 | parser.add_argument("--iters", default=20, type=int, help="Maximum number of additions") 42 | parser.add_argument("--data-mb", default=100, type=int, help="size of vector in MBs") 43 | parser.add_argument('--image', 44 | default='Deep Learning AMI (Ubuntu) Version 15.0') 45 | 46 | # internal flags 47 | parser.add_argument('--role', default='launcher', type=str) 48 | parser.add_argument("--sender-ip", default='127.0.0.1') 49 | parser.add_argument("--receiver-ip", default='127.0.0.1') 50 | args = parser.parse_args() 51 | 52 | cluster_spec = {'chief': [args.sender_ip + ':32300'], 53 | 'receiver': [args.receiver_ip + ':32301']} 54 | 55 | 56 | def _launch_server(role): 57 | os.environ['TF_CONFIG'] = json.dumps( 58 | {'cluster': cluster_spec, 59 | 'task': {'type': role, 'index': 0}}) 60 | config = tf.estimator.RunConfig() 61 | return tf.train.Server(config.cluster_spec, 62 | job_name=config.task_type, 63 | task_index=config.task_id) 64 | 65 | 66 | def run_launcher(): 67 | import ncluster 68 | if args.aws: 69 | ncluster.set_backend('aws') 70 | 71 | job = ncluster.make_job('tf_adder', num_tasks=2, image_name=args.image) 72 | job.upload(__file__) 73 | 74 | sender, receiver = job.tasks 75 | if ncluster.get_backend() == 'aws': 76 | # on AWS probably running in conda DLAMI, switch into TF-enabled env 77 | job.run('source activate tensorflow_p36') 78 | 79 | ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}' 80 | receiver.run(f'python 
tf_adder.py --role=receiver {ip_config}', 81 | non_blocking=True) 82 | sender.run(f'python tf_adder.py --role=sender {ip_config} --iters={args.iters}') 83 | 84 | 85 | def run_receiver(): 86 | server = _launch_server('receiver') 87 | time.sleep(365 * 24 * 3600) 88 | del server 89 | 90 | 91 | def run_sender(): 92 | param_size = 250 * 1000 * args.data_mb # 1MB is 250k integers 93 | with tf.device('/job:chief/task:0'): 94 | grads = tf.fill([param_size], 1.) 95 | 96 | with tf.device('/job:receiver/task:0'): 97 | params = tf.Variable(tf.ones([param_size])) 98 | add_op = params.assign_add(grads).op 99 | 100 | server = _launch_server('chief') 101 | sess = tf.Session(server.target) 102 | 103 | sess.run(tf.global_variables_initializer()) 104 | 105 | for i in range(args.iters): 106 | start_time = time.time() 107 | sess.run(add_op) 108 | elapsed_time = time.time() - start_time 109 | rate = args.data_mb / elapsed_time 110 | print('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.data_mb, elapsed_time * 1000, rate)) 111 | 112 | 113 | def main(): 114 | # run local benchmark in launcher and launch service 115 | if args.role == "launcher": 116 | run_launcher() 117 | elif args.role == "sender": 118 | run_sender() 119 | elif args.role == "receiver": 120 | run_receiver() 121 | else: 122 | assert False, 'unknown role' 123 | 124 | 125 | if __name__ == '__main__': 126 | main() 127 | -------------------------------------------------------------------------------- /examples/tf_adder_tb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | """ 5 | TensorFlow distributed benchmark + TensorBoard. Create sender/receiver tasks and add arrays from sender tasks to 6 | variable on receiver. 7 | 8 | To run locally: 9 | ./tf_adder_tb.py 10 | 11 | Should see something like this 12 | ``` 13 | ... 14 | Benchmark done, tensorboard at http://127.0.0.1:6006 15 | ``` 16 | 17 | 18 | To run on AWS 19 | aws configure # or set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY/AWS_DEFAULT_REGION 20 | 21 | ./tf_adder_tb.py --aws 22 | 23 | After a minute should see something like this 24 | 25 | ... 
26 | Benchmark done, tensorboard at http://35.173.134.87:6006 27 | """ 28 | 29 | import argparse 30 | import json 31 | import os 32 | import tensorflow as tf 33 | import time 34 | 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--role', default='launcher', type=str) 37 | parser.add_argument("--iters", default=20, help="Maximum number of additions") 38 | parser.add_argument("--data-mb", default=128, help="size of vector in MBs") 39 | parser.add_argument("--sender-ip", default='127.0.0.1') 40 | parser.add_argument("--receiver-ip", default='127.0.0.1') 41 | parser.add_argument("--logdir", help='logging directory') 42 | parser.add_argument("--aws", action='store_true') 43 | parser.add_argument('--image', default='Deep Learning AMI (Amazon Linux) Version 13.0') 44 | args = parser.parse_args() 45 | 46 | cluster_spec = {'chief': [args.sender_ip + ':32300'], 47 | 'receiver': [args.receiver_ip + ':32301']} 48 | 49 | 50 | def _launch_server(role): 51 | os.environ['TF_CONFIG'] = json.dumps( 52 | {'cluster': cluster_spec, 53 | 'task': {'type': role, 'index': 0}}) 54 | config = tf.estimator.RunConfig() 55 | return tf.train.Server(config.cluster_spec, 56 | job_name=config.task_type, 57 | task_index=config.task_id) 58 | 59 | 60 | def run_launcher(): 61 | import ncluster 62 | 63 | if args.aws: 64 | ncluster.set_backend('aws') 65 | job = ncluster.make_job('tf_adder_tb', num_tasks=2, image_name=args.image) 66 | job.upload(__file__) 67 | this_file = os.path.basename(__file__) 68 | 69 | sender, receiver = job.tasks 70 | if ncluster.get_backend() == 'aws': 71 | # on AWS probably are running in DLAMI, switch into TF-enabled env 72 | job.run('source activate tensorflow_p36') 73 | 74 | ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}' 75 | job.tasks[1].run(f'python {this_file} --role=receiver {ip_config}', non_blocking=True) 76 | job.tasks[0].run(f'python {this_file} --role=sender --logdir={job.logdir} {ip_config}') 77 | job.tasks[0].run(f'tensorboard --logdir={job.logdir}/..', non_blocking=True) 78 | print(f"Benchmark done, tensorboard at http://{job.tasks[0].public_ip}:6006") 79 | 80 | 81 | def run_receiver(): 82 | server = _launch_server('receiver') 83 | time.sleep(365 * 24 * 3600) 84 | del server 85 | 86 | 87 | def run_sender(): 88 | summary_writer = tf.summary.FileWriter(args.logdir) 89 | 90 | param_size = 250 * 1000 * args.data_mb # 1MB is 250k integers 91 | with tf.device('/job:chief/task:0'): 92 | grads = tf.fill([param_size], 1.) 
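  # The fill op above is placed on the chief (sender) task while the variable
  # below is pinned to the receiver task, so each sess.run(add_op) moves roughly
  # args.data_mb MB of tensor data between the two TF servers; that transfer is
  # what the per-iteration timing below measures.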
93 | 94 | with tf.device('/job:receiver/task:0'): 95 | params = tf.Variable(tf.ones([param_size])) 96 | add_op = params.assign_add(grads).op 97 | 98 | server = _launch_server('chief') 99 | sess = tf.Session(server.target) 100 | 101 | sess.run(tf.global_variables_initializer()) 102 | 103 | for i in range(args.iters): 104 | start_time = time.time() 105 | sess.run(add_op) 106 | elapsed_time = time.time() - start_time 107 | rate = args.data_mb / elapsed_time 108 | print('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.data_mb, elapsed_time * 1000, rate)) 109 | summary = tf.Summary() 110 | summary.value.add(tag='time_ms', simple_value=elapsed_time * 1000) 111 | summary_writer.add_summary(summary, i) 112 | 113 | summary_writer.close() 114 | 115 | 116 | def main(): 117 | # run local benchmark in launcher and launch service 118 | if args.role == "launcher": 119 | run_launcher() 120 | elif args.role == "sender": 121 | run_sender() 122 | elif args.role == "receiver": 123 | run_receiver() 124 | else: 125 | assert False, 'unknown role' 126 | 127 | 128 | if __name__ == '__main__': 129 | main() 130 | -------------------------------------------------------------------------------- /ncluster/__init__.py: -------------------------------------------------------------------------------- 1 | from . import aws_backend 2 | from . import aws_util 3 | from . import util 4 | 5 | from .aws_backend import make_task 6 | from .aws_backend import make_job 7 | 8 | # for type annotations 9 | from .aws_backend import Job 10 | from .aws_backend import Task 11 | 12 | from .aws_util import running_on_aws 13 | from .aws_util import get_zone 14 | from .aws_util import get_region 15 | 16 | from ._version import __version__ 17 | 18 | from .aws_backend import make_job 19 | from .aws_backend import make_task 20 | from .aws_backend import deprecated_set_backend as set_backend 21 | 22 | from . import aws_util as u 23 | 24 | from . import ncluster_globals 25 | 26 | 27 | print(f"ncluster version {__version__}") 28 | 29 | if not util.is_set('NCLUSTER_DISABLE_PDB_HANDLER') and not util.is_set('NCLUSTER_RUNNING_UNDER_CIRCLECI'): 30 | util.install_pdb_handler() # CTRL+\ drops into pdb 31 | 32 | -------------------------------------------------------------------------------- /ncluster/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.91' 2 | -------------------------------------------------------------------------------- /ncluster/local_backend.py: -------------------------------------------------------------------------------- 1 | """EXPERIMENTAL local backend which mirrors aws_backend API. Useful for debugging 2 | 3 | Not thread-safe. 4 | """ 5 | 6 | import glob 7 | import os 8 | import shlex 9 | import socket 10 | import time 11 | from typing import List, Tuple 12 | 13 | from ncluster import ncluster_globals 14 | from . 
import util 15 | 16 | TASKDIR_ROOT = '/tmp/ncluster/task' 17 | SCRATCH_ROOT = '/tmp/ncluster/scratch' 18 | # DEFAULT_LOGDIR_ROOT = '/ncluster/runs' 19 | 20 | 21 | # todo: tmux session names are backwards from AWS job names (runname-jobname) 22 | # TODO: add kwargs so that tmux backend can be drop-in replacement 23 | 24 | 25 | # TODO: rename extra_kwargs to kwargs everywhere 26 | class Task: 27 | """Local tasks interact with tmux session where session name is derived 28 | from job name, and window names are task ids.""" 29 | tmux_window_id: int 30 | tmux_available_window_ids: List[int] 31 | 32 | def __init__(self, name, *, tmux_session, install_script='', job=None, 33 | **kwargs): 34 | 35 | self.last_status = None 36 | self.homedir = os.environ['HOME'] 37 | self._cmd_fn = None 38 | self._cmd = None 39 | self._status_fn = None # location of output of last status 40 | self._out_fn = None 41 | 42 | self._can_run = False 43 | self.tmux_session = tmux_session 44 | self.tmux_window_id = 0 45 | self.tmux_available_window_ids = [0] 46 | 47 | self.name = name 48 | self.install_script = install_script 49 | self.job = job 50 | self.kwargs = kwargs 51 | 52 | # local servers sometimes listen only on localhost (TensorBoard), and sometimes only on 53 | # externally assigned ip address from gethostbyname (Ray), must choose one, so use the localhost for TB compatibility 54 | # https://github.com/ray-project/ray/issues/1677 55 | self.public_ip = socket.gethostbyname(socket.gethostname()) 56 | # self.public_ip = '127.0.0.1' 57 | self.ip = self.public_ip 58 | 59 | self.connect_instructions = 'tmux a -t ' + self.tmux_session 60 | 61 | # task current dir 62 | print('name is', name) 63 | # tmpdir = f"{util.reverse_taskname(name)}.{os.getpid()}.{util.now_micros()}" 64 | launch_id = util.random_id() 65 | self.taskdir = f"{TASKDIR_ROOT}/{name}-{launch_id}" 66 | self.local_scratch = f"{SCRATCH_ROOT}/{name}-{launch_id}" 67 | self.remote_scratch = f"{SCRATCH_ROOT}/{name}-{launch_id}" 68 | 69 | self.log(f"Creating taskdir {self.taskdir}") 70 | self._run_raw('mkdir -p ' + self.taskdir) 71 | 72 | self.log(f"Creating scratch {self.local_scratch}") 73 | self._run_raw('rm -Rf ' + self.local_scratch) 74 | self._run_raw('mkdir -p ' + self.local_scratch) 75 | self._run_raw('mkdir -p ' + self.remote_scratch) 76 | self.run_counter = 0 77 | 78 | self._cwd = self.taskdir 79 | self._can_run = True 80 | self.run('cd ' + self.taskdir) 81 | 82 | print("Running install script " + install_script) 83 | self.install_script = install_script 84 | for line in install_script.split('\n'): 85 | self.run(line) 86 | 87 | def run(self, cmd, non_blocking=False, ignore_errors=False, **_kwargs): 88 | 89 | if util.is_set('NCLUSTER_RUN_WITH_OUTPUT_ON_FAILURE'): 90 | # HACK 91 | if not util.is_bash_builtin(cmd) or True: 92 | return self._run_with_output_on_failure(cmd, non_blocking, ignore_errors, **_kwargs) 93 | else: 94 | self.log("Found bash built-in, using regular run") 95 | 96 | if not self._can_run: 97 | assert False, "Using .run before initialization finished" 98 | if '\n' in cmd: 99 | cmds = cmd.split('\n') 100 | self.log( 101 | f"Running {len(cmds)} commands at once, returning status of last") 102 | status = -1 103 | for subcmd in cmds: 104 | status = self.run(subcmd) 105 | return status 106 | 107 | cmd = cmd.strip() 108 | if not cmd or cmd.startswith('#'): # ignore empty/commented out lines 109 | return -1 110 | self.run_counter += 1 111 | self.log("tmux> %s", cmd) 112 | 113 | self._cmd = cmd 114 | self._cmd_fn = 
f'{self.local_scratch}/{self.run_counter}.cmd' 115 | self._status_fn = f'{self.remote_scratch}/{self.run_counter}.status' 116 | assert not os.path.exists(self._status_fn) 117 | 118 | cmd = util.shell_strip_comment(cmd) 119 | # assert '&' not in cmd, f"cmd {cmd} contains &, that breaks things" 120 | 121 | self.write(self._cmd_fn, cmd + '\n') 122 | modified_cmd = f'{cmd} ; echo $? > {self._status_fn}' 123 | modified_cmd = shlex.quote(modified_cmd) 124 | 125 | tmux_window = self.tmux_session+':'+str(self.tmux_window_id) 126 | tmux_cmd = f'tmux send-keys -t {tmux_window} {modified_cmd} Enter' 127 | self._run_raw(tmux_cmd, ignore_errors=ignore_errors) 128 | if non_blocking: 129 | return 0 130 | 131 | if not self.wait_for_file(self._status_fn, max_wait_sec=60): 132 | self.log(f"Retrying waiting for {self._status_fn}") 133 | while not self.exists(self._status_fn): 134 | self.log(f"Still waiting for {cmd}") 135 | self.wait_for_file(self._status_fn, max_wait_sec=60) 136 | contents = self.read(self._status_fn) 137 | 138 | # if empty wait a bit to allow for race condition 139 | if len(contents) == 0: 140 | time.sleep(0.01) 141 | status = int(open(self._status_fn).read().strip()) 142 | self.last_status = status 143 | 144 | if status != 0: 145 | if not ignore_errors: 146 | raise RuntimeError(f"Command {cmd} returned status {status}") 147 | else: 148 | self.log(f"Warning: command {cmd} returned status {status}") 149 | 150 | return status 151 | 152 | def join(self, ignore_errors=False): 153 | """Waits until last executed command completed.""" 154 | assert self._status_fn, "Asked to join a task which hasn't had any commands executed on it" 155 | check_interval = 0.2 156 | status_fn = self._status_fn 157 | if not self.wait_for_file(status_fn, max_wait_sec=30): 158 | self.log(f"Retrying waiting for {status_fn}") 159 | while not self.exists(status_fn): 160 | self.log(f"Still waiting for {self._cmd}") 161 | self.wait_for_file(status_fn, max_wait_sec=30) 162 | contents = self.read(status_fn) 163 | 164 | # if empty wait a bit to allow for race condition 165 | if len(contents) == 0: 166 | time.sleep(check_interval) 167 | contents = self.read(status_fn) 168 | status = int(contents.strip()) 169 | self.last_status = status 170 | 171 | if status != 0: 172 | extra_msg = '(ignoring error)' if ignore_errors else '(failing)' 173 | if util.is_set('NCLUSTER_RUN_WITH_OUTPUT_ON_FAILURE'): 174 | self.log( 175 | f"Start failing output {extra_msg}: \n{'*'*80}\n\n '{self.read(self._out_fn)}'") 176 | self.log(f"\n{'*'*80}\nEnd failing output") 177 | if not ignore_errors: 178 | raise RuntimeError(f"Command {self._cmd} returned status {status}") 179 | else: 180 | self.log(f"Warning: command {self._cmd} returned status {status}") 181 | 182 | return status 183 | 184 | def switch_window(self, window_id: int): 185 | """ 186 | Switches currently active tmux window for given task. 0 is the default window 187 | Args: 188 | window_id: integer id of tmux window to use 189 | """ 190 | 191 | # windows are numbered sequentially 0, 1, 2, ... 
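    # (switching only changes which window subsequent .run() calls send keys to;
    #  it does not change the window shown in an attached tmux client)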
192 | # create any missing windows and make them point to the same directory 193 | if window_id not in self.tmux_available_window_ids: 194 | for i in range(max(self.tmux_available_window_ids)+1, window_id+1): 195 | self._run_raw(f'tmux new-window -t {self.tmux_session} -d') 196 | 197 | tmux_window = self.tmux_session + ':' + str(i) 198 | cmd = shlex.quote(f'cd {self.taskdir}') 199 | tmux_cmd = f'tmux send-keys -t {tmux_window} {cmd} Enter' 200 | self._run_raw(tmux_cmd) 201 | self.tmux_available_window_ids.append(i) 202 | 203 | self.tmux_window_id = window_id 204 | 205 | # This is a future "run" command, will become "run" once all cases are checked 206 | def _run_with_output_on_failure(self, cmd, non_blocking=False, ignore_errors=False, **_kwargs) -> str: 207 | if not self._can_run: 208 | assert False, "Using .run before initialization finished" 209 | if '\n' in cmd: 210 | cmds = cmd.split('\n') 211 | self.log( 212 | f"Running {len(cmds)} commands at once, returning status of last") 213 | status = -1 214 | for subcmd in cmds: 215 | status = self.run(subcmd) 216 | return status 217 | 218 | cmd = cmd.strip() 219 | if not cmd or cmd.startswith('#'): # ignore empty/commented out lines 220 | return '' 221 | self.run_counter += 1 222 | self.log("tmux> %s", cmd) 223 | 224 | self._cmd = cmd 225 | self._cmd_fn = f'{self.local_scratch}/{self.run_counter}.cmd' 226 | self._status_fn = f'{self.remote_scratch}/{self.run_counter}.status' 227 | self._out_fn = f'{self.remote_scratch}/{self.run_counter}.out' 228 | assert not os.path.exists(self._status_fn) 229 | 230 | cmd = util.shell_strip_comment(cmd) 231 | # assert '&' not in cmd, f"cmd {cmd} contains &, that breaks things" 232 | 233 | self.write(self._cmd_fn, cmd + '\n') 234 | # modified_cmd = f'{cmd} ; echo $? > {self._status_fn}' 235 | modified_cmd = f'{cmd} > >(tee -a {self._out_fn}) 2> >(tee -a {self._out_fn} >&2); echo $? 
> {self._status_fn}' 236 | modified_cmd = shlex.quote(modified_cmd) 237 | 238 | tmux_window = self.tmux_session+':'+str(self.tmux_window_id) 239 | tmux_cmd = f'tmux send-keys -t {tmux_window} {modified_cmd} Enter' 240 | self._run_raw(tmux_cmd) 241 | if non_blocking: 242 | return '' 243 | 244 | if not self.wait_for_file(self._status_fn, max_wait_sec=60): 245 | self.log(f"Retrying waiting for {self._status_fn}") 246 | while not self.exists(self._status_fn): 247 | self.log(f"Still waiting for {cmd}") 248 | self.wait_for_file(self._status_fn, max_wait_sec=60) 249 | contents = self.read(self._status_fn) 250 | 251 | # if empty wait a bit to allow for race condition 252 | if len(contents) == 0: 253 | time.sleep(0.01) 254 | status = int(open(self._status_fn).read().strip()) 255 | self.last_status = status 256 | 257 | if status != 0: 258 | extra_msg = '(ignoring error)' if ignore_errors else '(failing)' 259 | self.log( 260 | f"Start failing output {extra_msg}: \n{'*'*80}\n\n '{self.read(self._out_fn)}'") 261 | self.log(f"\n{'*'*80}\nEnd failing output") 262 | if not ignore_errors: 263 | raise RuntimeError(f"Command {cmd} returned status {status}") 264 | else: 265 | self.log(f"Warning: command {cmd} returned status {status}") 266 | 267 | return self.read(self._out_fn) 268 | 269 | def _run_raw(self, cmd, ignore_errors=False): 270 | """Runs command directly, skipping tmux interface""" 271 | # TODO: capture stdout/stderr for feature parity with aws_backend 272 | result = os.system(cmd) 273 | if result != 0: 274 | if ignore_errors: 275 | self.log(f"command ({cmd}) failed.") 276 | assert False, "_run_raw failed" 277 | 278 | def rsync(self, local_fn, remote_fn=None): 279 | self.upload(local_fn, remote_fn) 280 | 281 | def upload(self, local_fn, remote_fn=None, dont_overwrite=False): 282 | """Uploads file to remote instance. If location not specified, dumps it 283 | into default directory. Creates missing directories in path name.""" 284 | 285 | # support wildcard through glob 286 | if '*' in local_fn: 287 | for local_subfn in glob.glob(local_fn): 288 | self.upload(local_subfn) 289 | return 290 | 291 | if remote_fn is None: 292 | remote_fn = os.path.basename(local_fn) 293 | 294 | if dont_overwrite and self.exists(remote_fn): 295 | self.log("Remote file %s exists, skipping" % (remote_fn,)) 296 | return 297 | 298 | if not remote_fn.startswith('/'): 299 | remote_fn = self.taskdir + '/' + remote_fn 300 | 301 | remote_fn = remote_fn.replace('~', self.homedir) 302 | self.log('uploading ' + local_fn + ' to ' + remote_fn) 303 | 304 | local_fn = os.path.abspath(local_fn) 305 | self._run_raw("cp -R %s %s" % (local_fn, remote_fn)) 306 | 307 | def download(self, remote_fn, local_fn='.'): 308 | if local_fn == '.': 309 | local_fn = self._cwd 310 | # self.log("downloading %s to %s" % (remote_fn, local_fn)) 311 | if not remote_fn.startswith('/'): 312 | remote_fn = self._cwd + '/' + remote_fn 313 | if self.exists(remote_fn): 314 | os.system(f'cp {remote_fn} {local_fn}') 315 | else: 316 | raise RuntimeError(f"No such file {remote_fn}") 317 | 318 | @staticmethod 319 | def exists(remote_fn): 320 | return os.path.exists(remote_fn) 321 | 322 | def read(self, remote_fn): 323 | tmp_fn = self.local_scratch + '/' + str(util.now_micros()) 324 | self.download(remote_fn, tmp_fn) 325 | return open(tmp_fn).read() 326 | 327 | def write(self, remote_fn, contents): 328 | def make_temp_fn(): 329 | """Returns temporary filename for this task.""" 330 | return self.local_scratch + '/write.' 
+ str(util.now_micros()) 331 | 332 | tmp_fn = make_temp_fn() 333 | open(tmp_fn, 'w').write(contents) 334 | self.upload(tmp_fn, remote_fn) 335 | 336 | # don't include file streaming for now 337 | # the issue is that file streaming by default turns on 4K buffering, which makes 338 | # streaming a lot less useful. Similar buffering is turned on for piping commands 339 | # https://unix.stackexchange.com/questions/25372/turn-off-buffering-in-pipe 340 | # def file_stream(self, fn: str) -> None: 341 | # # if not fn.startswith('/'): 342 | # # fn = self.taskdir + '/' + fn 343 | # 344 | # if not os.path.exists(fn): 345 | # os.system('mkdir -p ' + os.path.dirname(os.path.abspath(fn))) 346 | # os.system('touch ' + fn) 347 | # 348 | # p = subprocess.Popen(['tail', '-f', fn], stdout=subprocess.PIPE) 349 | # 350 | # for line in iter(p.stdout.readline, ''): 351 | # sys.stdout.write(line.decode('ascii', errors='ignore')) 352 | 353 | @property 354 | def logdir(self): 355 | """Returns logging directory, creating one if necessary. See "Logdir" section of design doc on naming convention.""" 356 | 357 | run_name = ncluster_globals.get_run_for_task(self) 358 | logdir = ncluster_globals.get_logdir(run_name) 359 | if logdir: 360 | return logdir 361 | 362 | # create logdir. Only single task in a group creates the logdir 363 | if ncluster_globals.is_chief(self, run_name): 364 | chief = self 365 | else: 366 | chief = ncluster_globals.get_chief(run_name) 367 | 368 | chief.setup_logdir() 369 | return ncluster_globals.get_logdir(run_name) 370 | # release lock 371 | 372 | def setup_logdir(self): 373 | # todo: locking on logdir creation 374 | 375 | """Create logdir for task/job/run. No-op if the task is not chief (0'th task of 0'th job of run) 376 | """ 377 | run_name = ncluster_globals.get_run_for_task(self) 378 | self.log("Creating logdir for run "+run_name) 379 | logdir_root = ncluster_globals.LOGDIR_ROOT 380 | assert logdir_root, "LOGDIR_ROOT not set, make sure you have called ncluster.set_backend()" 381 | 382 | self.run(f'mkdir -p {logdir_root}') 383 | find_command = f'find {logdir_root} -maxdepth 1 -type d' 384 | 385 | stdout, stderr = self.run_with_output(find_command) 386 | logdir = f"{logdir_root}/{run_name}" 387 | 388 | counter = 0 389 | while logdir in stdout: 390 | counter += 1 391 | new_logdir = f'{logdir_root}/{run_name}.{counter:02d}' 392 | self.log(f'Warning, logdir {logdir} exists, deduping to {new_logdir}') 393 | logdir = new_logdir 394 | self.run(f'mkdir -p {logdir}') 395 | 396 | ncluster_globals.set_logdir(run_name, logdir) 397 | return logdir 398 | 399 | def log(self, message, *args): 400 | """Log to launcher console.""" 401 | if args: 402 | message %= args 403 | 404 | print(f"{util.current_timestamp()} {self.name}: {message}") 405 | 406 | def wait_for_file(self, fn: str, max_wait_sec: int = 3600 * 24 * 365, 407 | check_interval: float = 0.02) -> bool: 408 | """ 409 | Waits for file maximum of max_wait_sec. 
Returns True if file was detected within specified max_wait_sec 410 | Args: 411 | fn: filename on task machine 412 | max_wait_sec: how long to wait in seconds 413 | check_interval: how often to check in seconds 414 | Returns: 415 | False if waiting was was cut short by max_wait_sec limit, True otherwise 416 | """ 417 | # print("Waiting for file", fn) 418 | start_time = time.time() 419 | while True: 420 | if time.time() - start_time > max_wait_sec: 421 | util.log(f"Timeout exceeded ({max_wait_sec} sec) for {fn}") 422 | return False 423 | if not self.exists(fn): 424 | time.sleep(check_interval) 425 | continue 426 | else: 427 | break 428 | return True 429 | 430 | # TODO: reuse regular run 431 | def run_with_output(self, cmd, non_blocking=False, ignore_errors=False) -> \ 432 | Tuple[str, str]: 433 | """ 434 | 435 | Args: 436 | cmd: single line shell command to run 437 | non_blocking (bool): if True, does not wait for command to finish 438 | ignore_errors: if True, will succeed even if command failed 439 | 440 | Returns: 441 | Contents of stdout/stderr as strings. 442 | Raises 443 | RuntimeException: if command produced non-0 returncode 444 | 445 | """ 446 | 447 | assert '\n' not in cmd, "Do not support multi-line commands" 448 | cmd: str = cmd.strip() 449 | if not cmd or cmd.startswith('#'): # ignore empty/commented out lines 450 | return '', '' 451 | 452 | stdout_fn = f"{self.remote_scratch}/{self.run_counter+1}.stdout" 453 | stderr_fn = f"{self.remote_scratch}/{self.run_counter+1}.stderr" 454 | cmd2 = f"{cmd} > {stdout_fn} 2> {stderr_fn}" 455 | 456 | assert not non_blocking, "Getting output doesn't work with non_blocking" 457 | status = self.run(cmd2, False, ignore_errors=True) 458 | stdout = self.read(stdout_fn) 459 | stderr = self.read(stderr_fn) 460 | 461 | if self.last_status > 0: 462 | self.log(f"Warning: command '{cmd}' returned {status}," 463 | f" stdout was '{stdout}' stderr was '{stderr}'") 464 | if not ignore_errors: 465 | raise RuntimeError(f"Warning: command '{cmd}' returned {status}," 466 | f" stdout was '{stdout}' stderr was '{stderr}'") 467 | 468 | return stdout, stderr 469 | 470 | 471 | def make_task(name='', 472 | run_name='', 473 | **kwargs) -> Task: 474 | """Create task, also create dummy run if not specified.""" 475 | ncluster_globals.task_launched = True 476 | 477 | name = ncluster_globals.auto_assign_task_name_if_needed(name) 478 | 479 | # tmux can't use . for session names 480 | tmux_session = name.replace('.', '=') 481 | tmux_window_id = 0 482 | util.log(f'killing session {tmux_session}') 483 | 484 | if not util.is_set("NCLUSTER_NOKILL_TMUX"): 485 | os.system(f'tmux kill-session -t {tmux_session}') 486 | os.system(f'tmux new-session -s {tmux_session} -n {tmux_window_id} -d') 487 | 488 | task = Task(name, 489 | tmux_session=tmux_session, # propagate optional args 490 | run_name=run_name, 491 | **kwargs) 492 | ncluster_globals.register_task(task, run_name) 493 | return task 494 | -------------------------------------------------------------------------------- /ncluster/ncluster_cloud_setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Creates resources. 
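# (checks/creates the default VPC, an ncluster security group opening SSH/NFS/
#  Jupyter/TensorBoard ports, an SSH keypair saved locally, and an EFS volume
#  with a mount target per subnet)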
4 | # To run standalone: 5 | # python -m ncluster.ncluster_cloud_setup 6 | # 7 | # This script creates VPC/security group/keypair if not already present 8 | 9 | import os 10 | import sys 11 | import time 12 | from typing import Tuple, Any, Optional 13 | 14 | from boto3_type_annotations.ec2 import SecurityGroup 15 | 16 | from ncluster import aws_util as u 17 | from ncluster import util 18 | 19 | DRYRUN = False 20 | DEBUG = True 21 | 22 | # Names of Amazon resources that are created. These settings are fixed across 23 | # all runs, and correspond to resources created once per user per region. 24 | 25 | NFS_PORT = 2049 26 | PUBLIC_TCP_RANGES = [ 27 | 22, # ssh 28 | NFS_PORT, # NFS port NFS peering between security groups 29 | # ipython notebook ports 30 | (8888, 8899), 31 | # redis port 32 | 6379, 33 | # tensorboard ports 34 | (6006, 6016) 35 | ] 36 | 37 | PUBLIC_UDP_RANGES = [NFS_PORT, (60000, 61000)] # mosh ports 38 | 39 | 40 | def network_setup() -> Tuple[Any, Any]: 41 | """Creates VPC if it doesn't already exists, configures it for public 42 | internet access, returns vpc, subnet, security_group""" 43 | 44 | # from https://gist.github.com/nguyendv/8cfd92fc8ed32ebb78e366f44c2daea6 45 | 46 | ec2 = u.get_ec2_resource() 47 | client = u.get_ec2_client() 48 | existing_vpcs = u.get_vpc_dict() 49 | zones = u.get_zones() 50 | 51 | # create VPC from scratch. Remove this if default VPC works well enough. 52 | create_non_default_vpc = False 53 | 54 | if create_non_default_vpc: 55 | vpc_name = u.get_vpc_name() 56 | if u.get_vpc_name() in existing_vpcs: 57 | print("Reusing VPC " + vpc_name) 58 | vpc = existing_vpcs[vpc_name] 59 | subnets = list(vpc.subnets.all()) 60 | assert len(subnets) == len( 61 | zones), "Has %s subnets, but %s zones, something went wrong during resource creation, try delete_resources.py/create_resources.py" % ( 62 | len(subnets), len(zones)) 63 | 64 | else: 65 | print("Creating VPC " + vpc_name) 66 | vpc = ec2.create_vpc(CidrBlock='192.168.0.0/16') 67 | 68 | # enable DNS on the VPC 69 | response = vpc.modify_attribute(EnableDnsHostnames={"Value": True}) 70 | assert u.is_good_response(response) 71 | response = vpc.modify_attribute(EnableDnsSupport={"Value": True}) 72 | assert u.is_good_response(response) 73 | 74 | vpc.create_tags(Tags=u.create_name_tags(vpc_name)) 75 | vpc.wait_until_available() 76 | 77 | gateways = u.get_gateway_dict(vpc) 78 | gateway_name = u.get_gateway_name() 79 | if gateway_name in gateways: 80 | print("Reusing gateways " + gateway_name) 81 | else: 82 | print("Creating internet gateway " + gateway_name) 83 | ig = ec2.create_internet_gateway() 84 | ig.attach_to_vpc(VpcId=vpc.id) 85 | ig.create_tags(Tags=u.create_name_tags(gateway_name)) 86 | 87 | # check that attachment succeeded 88 | attach_state = u.extract_attr_for_match(ig.attachments, State=-1, 89 | VpcId=vpc.id) 90 | assert attach_state == 'available', "vpc %s is in state %s" % (vpc.id, 91 | attach_state) 92 | route_table = vpc.create_route_table() 93 | route_table_name = u.get_route_table_name() 94 | route_table.create_tags(Tags=u.create_name_tags(route_table_name)) 95 | 96 | dest_cidr = '0.0.0.0/0' 97 | route_table.create_route( 98 | DestinationCidrBlock=dest_cidr, 99 | GatewayId=ig.id 100 | ) 101 | # check success 102 | for route in route_table.routes: 103 | # result looks like this 104 | # ec2.Route(route_table_id='rtb-a8b438cf', 105 | # destination_cidr_block='0.0.0.0/0') 106 | if route.destination_cidr_block == dest_cidr: 107 | break 108 | else: 109 | # sometimes get 110 | # AssertionError: Route 
for 0.0.0.0/0 not found in [ec2.Route(route_table_id='rtb-cd9153b0', destination_cidr_block='192.168.0.0/16')] 111 | # TODO: add a wait/retry? 112 | assert False, "Route for %s not found in %s" % (dest_cidr, 113 | route_table.routes) 114 | 115 | assert len(zones) <= 16 # for cidr/20 to fit into cidr/16 116 | ip = 0 117 | for zone in zones: 118 | cidr_block = '192.168.%d.0/20' % (ip,) 119 | ip += 16 120 | print("Creating subnet %s in zone %s" % (cidr_block, zone)) 121 | subnet = vpc.create_subnet(CidrBlock=cidr_block, 122 | AvailabilityZone=zone) 123 | subnet.create_tags(Tags=[{'Key': 'Name', 'Value': f'{vpc_name}-subnet'}, 124 | {'Key': 'Region', 'Value': zone}]) 125 | response = client.modify_subnet_attribute( 126 | MapPublicIpOnLaunch={'Value': True}, 127 | SubnetId=subnet.id 128 | ) 129 | assert u.is_good_response(response) 130 | u.wait_until_available(subnet) 131 | assert subnet.map_public_ip_on_launch, "Subnet doesn't enable public IP by default, why?" 132 | 133 | route_table.associate_with_subnet(SubnetId=subnet.id) 134 | 135 | # Setup security group for non-default VPC 136 | # existing_security_groups = u.get_security_group_dict() 137 | # security_group_nd_name = u.get_security_group_nd_name() 138 | # if security_group_nd_name in existing_security_groups: 139 | # print("Reusing non-default security group " + security_group_nd_name) 140 | # security_group_nd = existing_security_groups[security_group_nd_name] 141 | # assert security_group_nd.vpc_id == vpc.id, f"Found non-default security group {security_group_nd} " \ 142 | # f"attached to {security_group_nd.vpc_id} but expected {vpc.id}" 143 | # else: 144 | # security_group_nd = create_security_group(security_group_nd_name, vpc.id) 145 | 146 | # Setup things on default VPC for zone-agnostic launching 147 | vpc = u.get_default_vpc() 148 | if not vpc: 149 | util.log(f"Creating default VPC for region {u.get_region()}") 150 | client.create_default_vpc() 151 | vpc = u.get_default_vpc() 152 | assert vpc, "Could not create default VPC?" 
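  # (create_default_vpc() above is only expected to succeed when the region has
  #  no default VPC, e.g. it was deleted earlier; that is the only case in which
  #  this branch is reached)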
153 | 154 | existing_security_groups = u.get_security_group_dict() 155 | security_group_name = u.get_security_group_name() 156 | if security_group_name in existing_security_groups: 157 | print("Reusing security group " + security_group_name) 158 | security_group = existing_security_groups[security_group_name] 159 | assert security_group.vpc_id == vpc.id, f"Found security group {security_group} " \ 160 | f"attached to {security_group.vpc_id} but expected {vpc.id}" 161 | else: 162 | security_group = create_security_group(security_group_name, vpc.id) 163 | # Uncomment the following when setting up two VPC's 164 | # security_group = create_security_group(security_group_name, vpc.id, security_group_nd) 165 | 166 | return vpc, security_group 167 | 168 | 169 | def keypair_setup(): 170 | """Creates keypair if necessary, saves private key locally, returns contents 171 | of private key file.""" 172 | 173 | os.system('mkdir -p ' + u.PRIVATE_KEY_LOCATION) 174 | keypair_name = u.get_keypair_name() 175 | keypair = u.get_keypair_dict().get(keypair_name, None) 176 | keypair_fn = u.get_keypair_fn() 177 | if keypair: 178 | print("Reusing keypair " + keypair_name) 179 | # check that local pem file exists and is readable 180 | assert os.path.exists( 181 | keypair_fn), "Keypair %s exists, but corresponding .pem file %s is not found, delete keypair %s through console and run again to recreate keypair/.pem together" % ( 182 | keypair_name, keypair_fn, keypair_name) 183 | keypair_contents = open(keypair_fn).read() 184 | assert len(keypair_contents) > 0 185 | else: 186 | print("Creating keypair " + keypair_name) 187 | ec2 = u.get_ec2_resource() 188 | assert not os.path.exists( 189 | keypair_fn), "previous keypair exists, delete it with 'sudo rm %s' and also delete corresponding keypair through console" % ( 190 | keypair_fn) 191 | keypair = ec2.create_key_pair(KeyName=keypair_name) 192 | 193 | open(keypair_fn, 'w').write(keypair.key_material) 194 | os.system('chmod 400 ' + keypair_fn) 195 | 196 | return keypair 197 | 198 | 199 | def placement_group_setup(group_name): 200 | """Creates placement_group group if necessary. Returns True if new placement_group 201 | group was created, False otherwise.""" 202 | 203 | existing_placement_groups = u.get_placement_group_dict() 204 | 205 | group = existing_placement_groups.get(group_name, None) 206 | if group: 207 | assert group.state == 'available' 208 | assert group.strategy == 'cluster' 209 | print("Reusing group ", group.name) 210 | return group 211 | 212 | print("Creating group " + group_name) 213 | ec2 = u.get_ec2_resource() 214 | group = ec2.create_placement_group(GroupName=group_name, Strategy='cluster') 215 | return group 216 | 217 | 218 | def create_security_group(security_group_name: str, vpc_id: str, other_group: Optional[SecurityGroup] = None): 219 | """Creates security group with proper ports open. 
Optionally allows all traffic from other_group""" 220 | print("Creating security group " + security_group_name) 221 | ec2 = u.get_ec2_resource() 222 | 223 | security_group: SecurityGroup = ec2.create_security_group( 224 | GroupName=security_group_name, Description=security_group_name, 225 | VpcId=vpc_id) 226 | 227 | security_group.create_tags(Tags=u.create_name_tags(security_group_name)) 228 | 229 | # allow ICMP access for public ping 230 | security_group.authorize_ingress( 231 | CidrIp='0.0.0.0/0', 232 | IpProtocol='icmp', 233 | FromPort=-1, 234 | ToPort=-1 235 | ) 236 | 237 | # open public ports 238 | # always include SSH port which is required for basic functionality 239 | assert 22 in PUBLIC_TCP_RANGES, "Must enable SSH access" 240 | for port in PUBLIC_TCP_RANGES: 241 | if util.is_iterable(port): 242 | assert len(port) == 2 243 | from_port, to_port = port 244 | else: 245 | from_port, to_port = port, port 246 | 247 | response = security_group.authorize_ingress(IpProtocol="tcp", 248 | CidrIp="0.0.0.0/0", 249 | FromPort=from_port, 250 | ToPort=to_port) 251 | assert u.is_good_response(response) 252 | 253 | for port in PUBLIC_UDP_RANGES: 254 | if util.is_iterable(port): 255 | assert len(port) == 2 256 | from_port, to_port = port 257 | else: 258 | from_port, to_port = port, port 259 | 260 | response = security_group.authorize_ingress(IpProtocol="udp", 261 | CidrIp="0.0.0.0/0", 262 | FromPort=from_port, 263 | ToPort=to_port) 264 | assert u.is_good_response(response) 265 | 266 | def authorize_from_group(this_security_group: SecurityGroup, other_security_group: SecurityGroup): 267 | """Helper function to authorize all traffic from other_group. Can be used to authorized within-group traffic as 268 | authorize_from_group(group, group)""" 269 | 270 | # Authorizing ingress doesn't work with security group names in a non-default VPC, 271 | # so must use more complicated syntax: https://github.com/boto/boto3/issues/158 272 | response_ = {} 273 | for protocol in ['icmp']: 274 | try: 275 | rule = {'FromPort': -1, 276 | 'IpProtocol': protocol, 277 | 'IpRanges': [], 278 | 'PrefixListIds': [], 279 | 'ToPort': -1, 280 | 'UserIdGroupPairs': [{'GroupId': other_security_group.id}]} 281 | response_ = this_security_group.authorize_ingress(IpPermissions=[rule]) 282 | except Exception as e: 283 | if response_['Error']['Code'] == 'InvalidPermission.Duplicate': 284 | print("Warning, got " + str(e)) 285 | else: 286 | assert False, "Failed while authorizing icml ingress with " + str(e) 287 | 288 | for protocol in ['tcp', 'udp']: 289 | try: 290 | rule = {'FromPort': 0, 291 | 'IpProtocol': protocol, 292 | 'IpRanges': [], 293 | 'PrefixListIds': [], 294 | 'ToPort': 65535, 295 | 'UserIdGroupPairs': [{'GroupId': other_security_group.id}]} 296 | response_ = this_security_group.authorize_ingress(IpPermissions=[rule]) 297 | except Exception as e: 298 | if response_['Error']['Code'] == 'InvalidPermission.Duplicate': 299 | print("Warning, got " + str(e)) 300 | else: 301 | assert False, "Failed while authorizing tcp/udp ingress with " + str(e) 302 | 303 | # authorize EFA traffic 304 | user_id = u.get_account_number() 305 | response = None 306 | try: 307 | rule = { 308 | "IpProtocol": "-1", 309 | "Ipv6Ranges": [], 310 | "PrefixListIds": [], 311 | 'UserIdGroupPairs': [{'Description': 'efa', 'GroupId': other_security_group.id, 'UserId': user_id}] 312 | } 313 | response_ = this_security_group.authorize_ingress(IpPermissions=[rule]) 314 | assert u.is_good_response(response_), str(response) 315 | 316 | rule = { 317 | "IpProtocol": 
"-1", 318 | "PrefixListIds": [], 319 | 'UserIdGroupPairs': [{'Description': 'efa', 320 | 'GroupId': other_security_group.id, 321 | 'UserId': user_id}] 322 | } 323 | response_ = this_security_group.authorize_egress(IpPermissions=[rule]) 324 | assert u.is_good_response(response_), str(response) 325 | 326 | except Exception as e: 327 | if 'Error' in response_ and 'Code' in response['Error'] and response_['Error']['Code'] == 'InvalidPermission.Duplicate': 328 | print(f"Warning while authorizing ingress from {this_security_group.description} ({this_security_group.id}) to " 329 | f"{other_security_group.description} ({other_security_group.id}) with message '{e}'") 330 | 331 | else: 332 | assert False, (f"Failed while authorizing ingress from {this_security_group.description} ({this_security_group.id}) to " 333 | f"{other_security_group.description} ({other_security_group.id}) with message '{e}' and response '{response}'") 334 | 335 | authorize_from_group(security_group, security_group) 336 | # if using multiple security groups, which is required for the case of default + non-default VPC 337 | # also authorize all traffic between them 338 | if other_group: 339 | authorize_from_group(security_group, other_group) 340 | authorize_from_group(other_group, security_group) 341 | 342 | return security_group 343 | 344 | 345 | def create_resources(): 346 | print(f"Creating {u.get_prefix()} resources in region {u.get_region()}") 347 | 348 | vpc, security_group = network_setup() 349 | keypair_setup() # saves private key locally to keypair_fn 350 | 351 | # create EFS 352 | efss = u.get_efs_dict() 353 | efs_name = u.get_efs_name() 354 | efs_id = efss.get(efs_name, '') 355 | if not efs_id: 356 | print("Creating EFS " + efs_name) 357 | efs_id = u.create_efs(efs_name) 358 | else: 359 | print("Reusing EFS " + efs_name) 360 | 361 | efs_client = u.get_efs_client() 362 | 363 | # create mount target for each subnet in the VPC 364 | 365 | # added retries because efs is not immediately available 366 | max_failures = 10 367 | retry_interval_sec = 1 368 | for subnet in vpc.subnets.all(): 369 | for retry_attempt in range(max_failures): 370 | try: 371 | sys.stdout.write( 372 | "Creating efs mount target for %s ... " % (subnet.availability_zone,)) 373 | sys.stdout.flush() 374 | response = efs_client.create_mount_target(FileSystemId=efs_id, 375 | SubnetId=subnet.id, 376 | SecurityGroups=[ 377 | security_group.id]) 378 | if u.is_good_response(response): 379 | print("success") 380 | break 381 | except Exception as e: 382 | if 'already exists' in str(e): # ignore "already exists" errors 383 | print('already exists') 384 | break 385 | 386 | # Takes couple of seconds for EFS to come online, with 387 | # errors like this: 388 | # Creating efs mount target for us-east-1f ... 
Failed with An error occurred (IncorrectFileSystemLifeCycleState) when calling the CreateMountTarget operation: None, retrying in 1 sec 389 | 390 | print("Got %s, retrying in %s sec" % (str(e), retry_interval_sec)) 391 | time.sleep(retry_interval_sec) 392 | else: 393 | print("Giving up.") 394 | 395 | 396 | if __name__ == '__main__': 397 | create_resources() 398 | -------------------------------------------------------------------------------- /ncluster/ncluster_cloud_wipe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Deletes resources 4 | 5 | import sys 6 | import os 7 | import argparse 8 | 9 | from ncluster import aws_util as u 10 | from ncluster import util 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--kind', type=str, default='all', 14 | help="which resources to delete, all/network/keypair/efs") 15 | parser.add_argument('--force_delete_efs', action='store_true', 16 | help="force deleting main EFS") 17 | args = parser.parse_args() 18 | 19 | EFS_NAME = u.get_prefix() 20 | VPC_NAME = u.get_prefix() 21 | SECURITY_GROUP_NAME = u.get_prefix() 22 | ROUTE_TABLE_NAME = u.get_prefix() 23 | KEYPAIR_NAME = u.get_keypair_name() 24 | 25 | client = u.get_ec2_client() 26 | ec2 = u.get_ec2_resource() 27 | 28 | 29 | def response_type(response): 30 | return 'ok' if u.is_good_response(response) else 'failed' 31 | 32 | 33 | def delete_efs(): 34 | efss = u.get_efs_dict() 35 | efs_id = efss.get(EFS_NAME, '') 36 | efs_client = u.get_efs_client() 37 | if efs_id: 38 | try: 39 | # delete mount targets first 40 | print("About to delete %s (%s)" % (efs_id, EFS_NAME)) 41 | response = efs_client.describe_mount_targets(FileSystemId=efs_id) 42 | assert u.is_good_response(response) 43 | for mount_response in response['MountTargets']: 44 | id_ = mount_response['MountTargetId'] 45 | sys.stdout.write('Deleting mount target %s ... ' % (id_,)) 46 | sys.stdout.flush() 47 | response = efs_client.delete_mount_target(MountTargetId=id_) 48 | print(response_type(response)) 49 | 50 | sys.stdout.write('Deleting EFS %s (%s)... ' % (efs_id, EFS_NAME)) 51 | sys.stdout.flush() 52 | u.delete_efs_by_id(efs_id) 53 | 54 | except Exception as e: 55 | sys.stdout.write(f'failed with {e}\n') 56 | util.log_error(str(e) + '\n') 57 | 58 | 59 | def delete_network(): 60 | if u.get_region() == 'us-east-1': 61 | util.log("(Internal safety switch. Not deleting resources in us-east-1, remove this line if you are really sure") 62 | return 63 | 64 | def delete_vpc(vpc, partial=True): 65 | """Deletes VPC + all resources, if "partial" set to True, only deletes associated security groups 66 | """ 67 | print("Deleting VPC %s (%s) subresources:" % (VPC_NAME, vpc.id)) 68 | 69 | # don't modify default VPC 70 | if not partial: 71 | for subnet in vpc.subnets.all(): 72 | try: 73 | sys.stdout.write("Deleting subnet %s ... " % subnet.id) 74 | sys.stdout.write(response_type(subnet.delete()) + '\n') 75 | except Exception as e: 76 | sys.stdout.write('failed\n') 77 | util.log_error(str(e) + '\n') 78 | 79 | for gateway in vpc.internet_gateways.all(): 80 | sys.stdout.write("Deleting gateway %s ... " % gateway.id) 81 | # note: if instances are using VPC, this fails with 82 | # botocore.exceptions.ClientError: An error occurred (DependencyViolation) when calling the DetachInternetGateway operation: Network vpc-ca4abab3 has some mapped public address(es). Please unmap those public address(es) before detaching the gateway. 83 | 84 | sys.stdout.write('detached ... 
' if u.is_good_response( 85 | gateway.detach_from_vpc(VpcId=vpc.id)) else ' detach_failed ') 86 | sys.stdout.write('deleted ' if u.is_good_response( 87 | gateway.delete()) else ' delete_failed ') 88 | sys.stdout.write('\n') 89 | 90 | def desc(): 91 | return "%s (%s)" % (route_table.id, u.get_name(route_table.tags)) 92 | 93 | for route_table in vpc.route_tables.all(): 94 | sys.stdout.write(f"Deleting route table {desc()} ... ") 95 | try: 96 | sys.stdout.write(response_type(route_table.delete()) + '\n') 97 | except Exception as e: 98 | sys.stdout.write('failed\n') 99 | util.log_error(str(e) + '\n') 100 | 101 | else: 102 | util.log(f"vpc {vpc.id} is a default VPC, only doing partial deletion") 103 | 104 | def desc(): 105 | return "%s (%s, %s)" % ( 106 | security_group.id, u.get_name(security_group.tags), 107 | security_group.group_name) 108 | 109 | ncluster_security_groups = u.get_security_group_names() 110 | for security_group in vpc.security_groups.all(): 111 | # default group is undeletable, skip 112 | if security_group.group_name == 'default': 113 | continue 114 | 115 | # don't delete groups created outside of ncluster framework 116 | if security_group.group_name not in ncluster_security_groups: 117 | continue 118 | 119 | sys.stdout.write( 120 | 'Deleting security group %s ... ' % (desc())) 121 | try: 122 | sys.stdout.write(response_type(security_group.delete()) + '\n') 123 | except Exception as e: 124 | sys.stdout.write('failed\n') 125 | util.log_error(str(e) + '\n') 126 | 127 | if not partial: 128 | sys.stdout.write("Deleting VPC %s ... " % vpc.id) 129 | try: 130 | sys.stdout.write(response_type(vpc.delete()) + '\n') 131 | except Exception as e: 132 | sys.stdout.write('failed\n') 133 | util.log_error(str(e) + '\n') 134 | 135 | existing_vpcs = u.get_vpc_dict() 136 | if VPC_NAME in existing_vpcs: 137 | # delete ncluster VPC 138 | delete_vpc(ec2.Vpc(existing_vpcs[VPC_NAME].id), partial=False) 139 | 140 | # delete ncluster resources on default VPC (partial=True) 141 | delete_vpc(u.get_default_vpc()) 142 | 143 | 144 | def delete_keypair(): 145 | keypairs = u.get_keypair_dict() 146 | keypair = keypairs.get(KEYPAIR_NAME, '') 147 | if keypair: 148 | try: 149 | sys.stdout.write("Deleting keypair %s (%s) ... " % (keypair.key_name, 150 | KEYPAIR_NAME)) 151 | sys.stdout.write(response_type(keypair.delete()) + '\n') 152 | except Exception as e: 153 | sys.stdout.write('failed\n') 154 | util.log_error(str(e) + '\n') 155 | 156 | keypair_fn = u.get_keypair_fn() 157 | if os.path.exists(keypair_fn): 158 | print("Deleting local keypair file %s" % (keypair_fn,)) 159 | os.system('rm -f ' + keypair_fn) 160 | 161 | 162 | def delete_resources(force_delete_efs=False): 163 | region = os.environ['AWS_DEFAULT_REGION'] 164 | 165 | resource = u.get_prefix() 166 | answer = input(f"Deleting resources for account {u.get_account_number()}:{u.get_account_name()}, region {u.get_region()}, sure? 
(y/N) ") 167 | 168 | if util.is_set("NCLUSTER_SKIP_CONFIRMATION"): 169 | print("NCLUSTER_SKIP_CONFIRMATION is set, skipping confirmation") 170 | answer = 'y' 171 | 172 | if not answer.lower() == "y": 173 | print("Didn't get y, doing nothing") 174 | return 175 | 176 | print(f"Deleting {resource} resources in region {region}") 177 | print(f"Make sure {resource} instances are terminated or this will fail.") 178 | 179 | if 'efs' in args.kind or 'all' in args.kind: 180 | if EFS_NAME == u.DEFAULT_PREFIX and not force_delete_efs: 181 | # this is default EFS, likely has stuff, require extra flag to delete it 182 | print("default EFS has useful stuff in it, not deleting it. Use force_delete_efs " 183 | "flag to force. This means security group deletion will fail as well.") 184 | else: 185 | delete_efs() 186 | if 'network' in args.kind or 'all' in args.kind: 187 | delete_network() 188 | if 'keypair' in args.kind or 'all' in args.kind: 189 | delete_keypair() 190 | 191 | 192 | if __name__ == '__main__': 193 | delete_resources(force_delete_efs=args.force_delete_efs) 194 | -------------------------------------------------------------------------------- /ncluster/ncluster_globals.py: -------------------------------------------------------------------------------- 1 | """Module that keeps global state of ncluster tasks, such as naming, 2 | connection of tasks to runs. 3 | 4 | run refers to string name 5 | run_object refers to Run object corresponding to that name 6 | 7 | """ 8 | import os 9 | import sys 10 | from typing import Dict, Any, List 11 | 12 | from . import aws_backend as backend 13 | from . import util 14 | 15 | task_launched = False # keep track whether anything has been launched 16 | 17 | task_counter = 0 18 | job_counter = 0 19 | run_counter = 0 20 | 21 | run_dict: Dict[str, Any] = {} 22 | task_run_dict: Dict["backend.Task", str] = {} 23 | run_task_dict: Dict[str, List["backend.Task"]] = {} 24 | run_logdir_dict: Dict[str, str] = {} 25 | 26 | tasks_seen: List["backend.Task"] = [] # list of all tasks created 27 | 28 | enforce_placement_group_val = False 29 | 30 | 31 | def enforce_placement_group(): 32 | """Enforces all tasks to be launched into placement group.""" 33 | global enforce_placement_group_val 34 | enforce_placement_group_val = True 35 | 36 | 37 | def unenforce_placement_group(): 38 | """Enforces all tasks to be launched into placement group.""" 39 | global enforce_placement_group_val 40 | enforce_placement_group_val = False 41 | 42 | 43 | def is_enforced_placement_group(): 44 | return enforce_placement_group_val 45 | 46 | 47 | def auto_assign_task_name_if_needed(name, instance_type='', image_name='', 48 | tasks=1): 49 | global task_counter 50 | if name: 51 | return name 52 | 53 | main_script = os.path.abspath(sys.argv[0]) 54 | script_id = util.alphanumeric_hash( 55 | f"{main_script}-{instance_type}-{image_name}-{tasks}") 56 | name = f"unnamedtask-{task_counter}-{script_id}" 57 | task_counter += 1 58 | return name 59 | 60 | 61 | def auto_assign_job_name_if_needed(name): 62 | global job_counter 63 | if name: 64 | return name 65 | script_id = util.alphanumeric_hash(sys.argv[0]) 66 | name = f"unnamedjob-{job_counter}-{script_id}" 67 | job_counter += 1 68 | return name 69 | 70 | 71 | def auto_assign_run_name_if_needed(name): 72 | global run_counter 73 | if name: 74 | return name 75 | script_id = util.alphanumeric_hash(sys.argv[0]) 76 | name = f"unnamedrun-{run_counter}-{script_id}" 77 | run_counter += 1 78 | return name 79 | 80 | 81 | def register_task(task: Any, run_name: str): 82 | 
global task_run_dict, run_task_dict, tasks_seen 83 | assert task.name not in tasks_seen 84 | tasks_seen.append(task.name) 85 | task_run_dict[task] = run_name 86 | run_task_list = run_task_dict.get(run_name, []) 87 | run_task_list.append(task) 88 | 89 | # disable check because it's useless (instance creation fails with missing placement group before getting to register_task) 90 | # enforce uniformity -- either all tasks in a run are reused (assuming 1 job per run) or all tasks are created fresh 91 | # has_reuse = sum(task.instance_reuse for task in run_task_list) 92 | # has_fresh = sum(not task.instance_reuse for task in run_task_list) 93 | # if has_reuse + has_fresh != 1: 94 | # tasks_to_kill = [task.name for task in run_task_list] 95 | # print(f"Fatal: trying to reuse some instances while recreating others. Launching a group requires launching all " 96 | # f"instances together. Kill following instances and try again: {','.join(tasks_to_kill)}") 97 | # for task in run_task_list: 98 | # print(f"{task.name}: {'reused' if task.instance_reuse else 'fresh'}") 99 | # os.kill(os.getpid(), signal.SIGTERM) # sys.exit() doesn't work inside thread 100 | 101 | 102 | def register_run(run: "backend.Run", run_name: str) -> None: 103 | print(f"Registering run {run_name}") 104 | assert run_name not in run_dict 105 | assert run_name # empty name reserved to mean no run 106 | run_dict[run_name] = run 107 | 108 | 109 | def is_chief(task: "backend.Task", run_name: str): 110 | """Returns True if task is chief task in the corresponding run""" 111 | global run_task_dict 112 | if run_name not in run_task_dict: 113 | return True 114 | task_list = run_task_dict[run_name] 115 | assert task in task_list, f"Task {task.name} doesn't belong to run {run_name}" 116 | return task_list[0] == task 117 | 118 | 119 | def get_chief(run_name: str): 120 | assert run_name in run_task_dict, f"Run {run_name} doesn't exist" 121 | tasks = run_task_dict[run_name] 122 | assert tasks, f"Run {run_name} had tasks {tasks}, expected non-empty list" 123 | return tasks[0] 124 | 125 | 126 | def get_logdir(run_name: str): 127 | """Returns logdir for this run. 
It is the job of logdir creator to set logdir for this run""" 128 | 129 | if not run_name: 130 | return '/tmp' 131 | return run_logdir_dict.get(run_name, '') 132 | 133 | 134 | def set_logdir(run_name, logdir): 135 | assert run_name not in run_logdir_dict, f"logdir for run {run_name} has already been set to {run_logdir_dict[run_name]}, trying to change it to {logdir} is illegal" 136 | run_logdir_dict[run_name] = logdir 137 | 138 | 139 | def get_run_for_task(task: "backend.Task") -> str: 140 | """Gets run name associated with given Task""" 141 | return task_run_dict.get(task, '') 142 | 143 | 144 | def get_run_object(run_name: str) -> "backend.Run": 145 | return run_dict.get(run_name, None) 146 | 147 | 148 | def create_run_if_needed(run_name, run_creation_callback) -> "backend.Run": 149 | if run_name in run_dict: 150 | return run_dict[run_name] 151 | run = run_creation_callback(run_name) 152 | return run 153 | 154 | 155 | _should_skip_setup = False 156 | 157 | 158 | def set_should_skip_setup(val): 159 | global _should_skip_setup 160 | if val: 161 | util.log("skipping setup for all subsequent tasks/jobs") 162 | _should_skip_setup = val 163 | 164 | 165 | def should_skip_setup(): 166 | return _should_skip_setup 167 | -------------------------------------------------------------------------------- /ncluster/old_backend.py: -------------------------------------------------------------------------------- 1 | """Interface for job launching backend. 2 | 3 | Run/Job and Task are container classes encapsulating functionality. 4 | User creates them through make_run/make_job/make_task methods 5 | 6 | """ 7 | # Job launcher Python API: https://docs.google.com/document/d/1yTkb4IPJXOUaEWksQPCH7q0sjqHgBf3f70cWzfoFboc/edit 8 | # AWS job launcher (concepts): https://docs.google.com/document/d/1IbVn8_ckfVO3Z9gIiE0b9K3UrBRRiO9HYZvXSkPXGuw/edit 9 | import threading 10 | import time 11 | from typing import List, Tuple, Any, Optional 12 | 13 | from . 
import util 14 | 15 | # aws_backend.py 16 | # local_backend.py 17 | 18 | LOGDIR_ROOT: Optional[str] = None # location of logdir for this backend 19 | 20 | """ 21 | backend = aws_backend # alternatively, backend=tmux_backend to launch jobs locally in separate tmux sessions 22 | run = backend.make_run("helloworld") # sets up /efs/runs/helloworld 23 | worker_job = run.make_job("worker", instance_type="g3.4xlarge", num_tasks=4, ami=ami, setup_script=setup_script) 24 | ps_job = run.make_job("ps", instance_type="c5.xlarge", num_tasks=4, ami=ami, setup_script=setup_script) 25 | setup_tf_config(worker_job, ps_job) 26 | ps_job.run("python cifar10_main.py --num_gpus=0") # runs command on each task 27 | worker_job.run("python cifar10_main.py --num_gpus=4") 28 | 29 | tb_job = run.make_job("tb", instance_type="m4.xlarge", num_tasks=1, public_port=6006) 30 | tb_job.run("tensorboard --logdir=%s --port=%d" %(run.logdir, 6006)) 31 | # when job has one task, job.task[0].ip can be accessed as job.ip 32 | print("See TensorBoard progress on %s:%d" %(tb_job.ip, 6006)) 33 | print("To interact with workers: %s" %(worker_job.connect_instructions)) 34 | 35 | 36 | To reconnect to existing job: 37 | 38 | """ 39 | 40 | 41 | class Task: 42 | name: str 43 | ip: Optional[str] 44 | public_ip: Optional[str] 45 | run_counter: int 46 | # location where temporary files from interfacing with task go locally 47 | local_scratch: Optional[str] 48 | # location where temporary files from interfacing with task go on task 49 | remote_scratch: Optional[str] 50 | job: Any # can't declare Job because of circular dependency 51 | 52 | def __init__(self, name=''): 53 | """Wraps execution resources into a task. Runs install_script if present""" 54 | self.last_status = None 55 | self.name = name 56 | self.instance = None 57 | self.install_script = None 58 | self.job = None 59 | self.kwargs = None 60 | self.public_ip = None 61 | self.ip = None 62 | self.logdir_ = None 63 | 64 | @property 65 | def logdir(self): 66 | raise NotImplementedError() 67 | 68 | def run(self, cmd: str, non_blocking=False, ignore_errors=False): 69 | """Runs command on given task.""" 70 | raise NotImplementedError() 71 | 72 | # TODO: reuse regular run 73 | def run_with_output(self, cmd, non_blocking=False, ignore_errors=False) -> \ 74 | Tuple[str, str]: 75 | """ 76 | 77 | Args: 78 | cmd: single line shell command to run 79 | non_blocking (bool): if True, does not wait for command to finish 80 | ignore_errors: if True, will succeed even if command failed 81 | 82 | Returns: 83 | Contents of stdout/stderr as strings. 
84 | Raises 85 | RuntimeException: if command produced non-0 returncode 86 | 87 | """ 88 | 89 | assert '\n' not in cmd, "Do not support multi-line commands" 90 | cmd: str = cmd.strip() 91 | if not cmd or cmd.startswith('#'): # ignore empty/commented out lines 92 | return '', '' 93 | 94 | stdout_fn = f"{self.remote_scratch}/{self.run_counter+1}.stdout" 95 | stderr_fn = f"{self.remote_scratch}/{self.run_counter+1}.stderr" 96 | cmd2 = f"{cmd} > {stdout_fn} 2> {stderr_fn}" 97 | 98 | assert not non_blocking, "Getting output doesn't work with non_blocking" 99 | status = self.run(cmd2, False, ignore_errors=True) 100 | stdout = self.read(stdout_fn) 101 | stderr = self.read(stderr_fn) 102 | 103 | if self.last_status > 0: 104 | self.log(f"Warning: command '{cmd}' returned {status}," 105 | f" stdout was '{stdout}' stderr was '{stderr}'") 106 | if not ignore_errors: 107 | raise RuntimeError(f"Warning: command '{cmd}' returned {status}," 108 | f" stdout was '{stdout}' stderr was '{stderr}'") 109 | 110 | return stdout, stderr 111 | 112 | def wait_for_file(self, fn: str, max_wait_sec: int = 3600 * 24 * 365, 113 | check_interval: float = 0.02) -> bool: 114 | """ 115 | Waits for file maximum of max_wait_sec. Returns True if file was detected within specified max_wait_sec 116 | Args: 117 | fn: filename on task machine 118 | max_wait_sec: how long to wait in seconds 119 | check_interval: how often to check in seconds 120 | Returns: 121 | False if waiting was was cut short by max_wait_sec limit, True otherwise 122 | """ 123 | # print("Waiting for file", fn) 124 | start_time = time.time() 125 | while True: 126 | if time.time() - start_time > max_wait_sec: 127 | util.log(f"Timeout exceeded ({max_wait_sec} sec) for {fn}") 128 | return False 129 | if not self.exists(fn): 130 | time.sleep(check_interval) 131 | continue 132 | else: 133 | break 134 | return True 135 | 136 | def _run_raw(self, cmd): 137 | """Runs command directly on every task in the job, skipping tmux interface. Use if want to create/manage additional tmux sessions manually.""" 138 | raise NotImplementedError() 139 | 140 | def upload(self, local_fn: str, remote_fn: str = '', 141 | dont_overwrite: bool = False): 142 | """Uploads given file to the task. If remote_fn is not specified, dumps it 143 | into task current directory with the same name. 
144 | 145 | Args: 146 | local_fn: location of file locally 147 | remote_fn: location of file on task 148 | dont_overwrite: if True, will be no-op if target file exists 149 | """ 150 | raise NotImplementedError() 151 | 152 | def download(self, remote_fn: str, local_fn: str = ''): 153 | """Downloads remote file to current directory.""" 154 | raise NotImplementedError() 155 | 156 | def write(self, fn, contents): 157 | """Write string contents to file fn in task.""" 158 | raise NotImplementedError() 159 | 160 | def read(self, fn): 161 | """Read contents of file and return it as string.""" 162 | raise NotImplementedError() 163 | 164 | def exists(self, fn) -> bool: 165 | """Checks if fn exists on task 166 | 167 | Args: 168 | fn: filename local to task 169 | Returns: 170 | true if fn exists on task machine 171 | """ 172 | raise NotImplementedError() 173 | 174 | def log(self, message, *args): 175 | """Log to launcher console.""" 176 | if args: 177 | message %= args 178 | 179 | print(f"{util.current_timestamp()} {self.name}: {message}") 180 | 181 | 182 | class Job: 183 | name: str 184 | tasks: List[Task] 185 | 186 | # run_: Run 187 | 188 | def __init__(self, name: str, tasks: List[Task] = None, **kwargs): 189 | """Initializes Job object, links tasks to refer back to the Job.""" 190 | if tasks is None: 191 | tasks = [] 192 | self.name = name 193 | self.tasks = tasks 194 | self.kwargs = kwargs 195 | # TODO: maybe backlinking is not needed 196 | for task in tasks: 197 | task.job = self 198 | 199 | @property 200 | def logdir(self): 201 | return self.tasks[0].logdir 202 | 203 | def _task_parallel(self, method, *args, **kwargs): 204 | """Runs given method on every task in the job in parallel. Blocks until all tasks finish. Propagates exception from first 205 | failed task.""" 206 | 207 | exceptions = [] 208 | 209 | def task_run(task): 210 | try: 211 | getattr(task, method)(*args, **kwargs) 212 | except Exception as e: 213 | exceptions.append(e) 214 | 215 | threads = [threading.Thread(name=f'task_{method}_{i}', 216 | target=task_run, args=[t]) 217 | for i, t in enumerate(self.tasks)] 218 | for thread in threads: 219 | thread.start() 220 | for thread in threads: 221 | thread.join() 222 | if exceptions: 223 | raise exceptions[0] 224 | 225 | def run(self, *args, **kwargs): 226 | """Runs command on every task in the job in parallel, blocks until all tasks finish. 227 | See Task for documentation of args/kwargs.""" 228 | return self._task_parallel("run", *args, **kwargs) 229 | 230 | def propagate_env(self, *args, **kwargs): 231 | """See py:func:`aws_backend.Task.propagate_env`""" 232 | return self._task_parallel("propagate_env", *args, **kwargs) 233 | 234 | def run_with_output(self, *args, **kwargs): 235 | """Runs command on every task in the job in parallel, blocks until all tasks finish. 
236 | See Task for documentation of args/kwargs.""" 237 | return self._task_parallel("run_with_output", *args, **kwargs) 238 | 239 | def rsync(self, *args, **kwargs): 240 | """See :py:func:`backend.Task.rsync`""" 241 | return self._task_parallel("rsync", *args, **kwargs) 242 | 243 | def upload(self, *args, **kwargs): 244 | """See :py:func:`backend.Task.upload`""" 245 | return self._task_parallel("upload", *args, **kwargs) 246 | 247 | def write(self, *args, **kwargs): 248 | return self._task_parallel("write", *args, **kwargs) 249 | 250 | def _run_raw(self, *args, **kwargs): 251 | return self._task_parallel("_run_raw", *args, **kwargs) 252 | 253 | 254 | # Implementation needs to be backend specific so that run.create_job calls backend-specific method 255 | class Run: 256 | """Run is a collection of jobs that share state. IE, training run will contain gradient worker job, parameter 257 | server job, and TensorBoard visualizer job. These jobs will use the same shared directory to store checkpoints and 258 | event files. 259 | :ivar aws_placement_group_name: somedoc 260 | """ 261 | jobs: List[Job] 262 | 263 | @property 264 | def logdir(self): 265 | raise NotImplementedError() 266 | 267 | # TODO: currently this is synchronous, use non_blocking wrapper like in Job to parallelize methods 268 | def run(self, *args, **kwargs): 269 | raise NotImplementedError() 270 | 271 | def run_with_output(self, *args, **kwargs): 272 | raise NotImplementedError() 273 | 274 | def _run_raw(self, *args, **kwargs): 275 | raise NotImplementedError() 276 | 277 | def upload(self, *args, **kwargs): 278 | raise NotImplementedError() 279 | 280 | def make_job(self, name='', **kwargs): 281 | raise NotImplementedError() 282 | 283 | 284 | def make_task(**_kwargs): 285 | raise NotImplementedError() 286 | 287 | 288 | def make_job(**_kwargs): 289 | raise NotImplementedError() 290 | 291 | 292 | def make_run(**_kwargs): 293 | raise NotImplementedError() 294 | -------------------------------------------------------------------------------- /ncluster/summary.txt: -------------------------------------------------------------------------------- 1 | tf_two_machines -- 500 on t3, 910 on c3 2 | 3 | -------------------------------------------------------------------------------- /ncluster/test.py: -------------------------------------------------------------------------------- 1 | 2 | print("%20s" % ('asdfasdf',)) 3 | print(f"{'asdfasdf':>20}") 4 | 5 | print("%5.2f" % (5.5,)) 6 | print(f"{5.5:5.2f}") 7 | -------------------------------------------------------------------------------- /ncluster/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various helper utilities used internally by ncluster project, that are not explicitly tied to AWS 3 | """ 4 | 5 | import os 6 | import random 7 | import string 8 | import sys 9 | import time 10 | from collections import Iterable 11 | import shlex 12 | 13 | from typing import Optional, Tuple 14 | 15 | import portalocker 16 | import paramiko 17 | 18 | # starting value for now_micros (Aug 31, 2018), using this to make various timestamped names shorter 19 | EPOCH_MICROS = 1535753974788163 20 | 21 | 22 | # Whitelist of temporary settings customizable through env vars. These are work-arounds for issues that are 23 | # don't have permanent solutions yet. Keep this small to avoid many moving parts. 
24 | env_settings = { 25 | 'NCLUSTER_AUTHORIZED_KEYS', # public keys used to authorize ssh access on all instances 26 | 'NCLUSTER_AWS_FAST_ROOTDISK', # request $1/hour high performance AWS disk 27 | 'NCLUSTER_AWS_PLACEMENT_GROUP', # name of placement group to use, use when adding machines to a previous launched job 28 | 'NCLUSTER_DISABLE_PDB_HANDLER', # don't intercept pdb exception by default 29 | 'NCLUSTER_RUNNING_UNDER_CIRCLECI', # special settings for non-interactive CircleCI integration test env 30 | # 'NCLUSTER_IMAGE', 31 | 'NCLUSTER_SSH_USERNAME', # used as workaround when Amazon Linux detection fails 32 | 'NCLUSTER_ZONE', # zone spec for when automatic zone fails (p3dn's + spot instances) 33 | 'NCLUSTER_AWS_FORCE_CREATE_RESOURCES', # AWS resources are created ignoring automatic existence checks 34 | } 35 | 36 | 37 | # keep this here instead of aws_backend because it's used by its dependency aws_util 38 | VALID_REGIONS = ['us-east-2', 39 | 'us-east-1', 40 | 'us-west-1', # An error occurred (Unsupported) when calling the RunInstances operation 41 | 'us-west-2', 42 | 'ap-east-1', # doesn't have ec2 43 | 'ap-south-1', # no EFS 44 | 'ap-northeast-3', # An error occurred (OptInRequired) when calling the DescribeVpcs operation 45 | 'ap-northeast-2', 46 | 'ap-southeast-1', 47 | 'ap-southeast-2', 48 | 'ap-northeast-1', 49 | 'ca-central-1', 50 | 'cn-north-1', # account number 51 | 'cn-northwest-1', # account number 52 | 'eu-central-1', 53 | 'eu-west-1', 54 | 'eu-west-2', 55 | 'eu-west-3', # no EFS 56 | 'eu-north-1', # no EFS 57 | 'sa-east-1', # no EFS 58 | 'us-gov-east-1', # not authorized 59 | 'us-gov-west-1', # not authorized 60 | ] 61 | 62 | # print/validate custom settings 63 | for v in os.environ: 64 | if v.startswith('NCLUSTER'): 65 | assert v in env_settings, f"Custom setting '{v}'='{os.environ[v]}' not in settings whitelist, if you" \ 66 | f"are sure you need this setting, add it to the env_settings in {os.path.basename(__file__)}, otherwise 'unset {v}'" 67 | # the following are often set by default, so don't print them 68 | if v in {'NCLUSTER_AUTHORIZED_KEYS', 'NCLUSTER_ZONE'}: 69 | continue 70 | 71 | sys.stderr.write(f"ncluster env setting {v}={os.environ[v]}\n") 72 | 73 | 74 | def is_iterable(k): 75 | return isinstance(k, Iterable) 76 | 77 | 78 | def now_micros(absolute=False) -> int: 79 | """Return current micros since epoch as integer.""" 80 | micros = int(time.time() * 1e6) 81 | if absolute: 82 | return micros 83 | return micros - EPOCH_MICROS 84 | 85 | 86 | def now_millis(absolute=False) -> int: 87 | """Return current millis since epoch as integer.""" 88 | millis = int(time.time() * 1e3) 89 | if absolute: 90 | return millis 91 | return millis - EPOCH_MICROS // 1000 92 | 93 | 94 | def current_timestamp() -> str: 95 | # timestamp format from https://github.com/tensorflow/tensorflow/blob/155b45698a40a12d4fef4701275ecce07c3bb01a/tensorflow/core/platform/default/logging.cc#L80 96 | current_seconds = time.time() 97 | remainder_micros = int(1e6 * (current_seconds - int(current_seconds))) 98 | time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(current_seconds)) 99 | full_time_str = "%s.%06d" % (time_str, remainder_micros) 100 | return full_time_str 101 | 102 | 103 | def log_error(*args, **kwargs): 104 | print(f"Error encountered {args} {kwargs}") 105 | 106 | 107 | def log(*args, **kwargs): 108 | print(f"{args} {kwargs}") 109 | 110 | 111 | def install_pdb_handler(): 112 | """Automatically start pdb: 113 | 1. CTRL+\\ breaks into pdb. 114 | 2. pdb gets launched on exception. 
115 | """ 116 | 117 | import signal 118 | import pdb 119 | 120 | def handler(_signum, _frame): 121 | pdb.set_trace() 122 | signal.signal(signal.SIGQUIT, handler) 123 | 124 | # Drop into PDB on exception 125 | # from https://stackoverflow.com/questions/13174412 126 | def info(type_, value, tb): 127 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 128 | # we are in interactive mode or we don't have a tty-like 129 | # device, so we call the default hook 130 | sys.__excepthook__(type_, value, tb) 131 | else: 132 | import traceback 133 | import pdb 134 | # we are NOT in interactive mode, print the exception... 135 | traceback.print_exception(type_, value, tb) 136 | print() 137 | # ...then start the debugger in post-mortem mode. 138 | pdb.pm() 139 | 140 | sys.excepthook = info 141 | 142 | 143 | def shell_add_echo(script): 144 | """Goes over each line script, adds "echo cmd" in front of each cmd. 145 | 146 | ls a 147 | 148 | becomes 149 | 150 | echo * ls a 151 | ls a 152 | """ 153 | new_script = "" 154 | for cmd in script.split('\n'): 155 | cmd = cmd.strip() 156 | if not cmd: 157 | continue 158 | new_script += "echo \\* " + shlex.quote(cmd) + "\n" 159 | new_script += cmd + "\n" 160 | return new_script 161 | 162 | 163 | def shell_strip_comment(cmd): 164 | """ hi # testing => hi""" 165 | if '#' in cmd: 166 | return cmd.split('#', 1)[0] 167 | else: 168 | return cmd 169 | 170 | 171 | def random_id(k=5): 172 | """Random id to use for AWS identifiers.""" 173 | # https://stackoverflow.com/questions/2257441/random-string-generation-with-upper-case-letters-and-digits-in-python 174 | return ''.join(random.choices(string.ascii_lowercase + string.digits, k=k)) 175 | 176 | 177 | def alphanumeric_hash(s: str, size=5): 178 | """Short alphanumeric string derived from hash of given string""" 179 | import hashlib 180 | import base64 181 | hash_object = hashlib.md5(s.encode('ascii')) 182 | s = base64.b32encode(hash_object.digest()) 183 | result = s[:size].decode('ascii').lower() 184 | return result 185 | 186 | 187 | def reverse_taskname(name: str) -> str: 188 | """ 189 | Reverses components in the name of task. 
Reversed convention is used for filenames since 190 | it groups log/scratch files of related tasks together 191 | 192 | 0.somejob.somerun -> somerun.somejob.0 193 | 0.somejob -> somejob.0 194 | somename -> somename 195 | 196 | Args: 197 | name: name of task 198 | 199 | """ 200 | components = name.split('.') 201 | assert len(components) <= 3 202 | return '.'.join(components[::-1]) 203 | 204 | 205 | def is_bash_builtin(cmd): 206 | """Return true if command is invoking bash built-in 207 | """ 208 | # from compgen -b 209 | bash_builtins = ['alias', 'bg', 'bind', 'alias', 'bg', 'bind', 'break', 210 | 'builtin', 'caller', 'cd', 'command', 'compgen', 'complete', 211 | 'compopt', 'continue', 'declare', 'dirs', 'disown', 'echo', 212 | 'enable', 'eval', 'exec', 'exit', 'export', 'false', 'fc', 213 | 'fg', 'getopts', 'hash', 'help', 'history', 'jobs', 'kill', 214 | 'let', 'local', 'logout', 'mapfile', 'popd', 'printf', 215 | 'pushd', 'pwd', 'read', 'readarray', 'readonly', 'return', 216 | 'set', 'shift', 'shopt', 'source', 'suspend', 'test', 217 | 'times', 'trap', 'true', 'type', 'typeset', 'ulimit', 218 | 'umask', 'unalias', 'unset', 'wait'] 219 | toks = cmd.split() 220 | if toks and toks[0] in bash_builtins: 221 | return True 222 | return False 223 | 224 | 225 | def is_set(name: str) -> bool: 226 | """Helper method to check if given property is set""" 227 | assert name in env_settings 228 | 229 | val = os.environ.get(name, '0') 230 | return not (val == '0') 231 | 232 | 233 | def get_env(name: str) -> Optional[str]: 234 | """Helper method to retrieve custom env setting, returns None if not set""" 235 | assert name in env_settings 236 | return os.environ.get(name, None) 237 | 238 | 239 | def set_env(name: str, value: str) -> None: 240 | """Helper method to set custom env setting""" 241 | assert name in env_settings 242 | os.environ[name] = value 243 | 244 | 245 | def assert_script_in_current_directory(): 246 | """Assert fail if current directory is different from location of the script""" 247 | 248 | script = sys.argv[0] 249 | assert os.path.abspath(os.path.dirname(script)) == os.path.abspath( 250 | '.'), f"Change into directory of script {script} and run again." 251 | 252 | 253 | def validate_ncluster_job_name(name): 254 | assert name.count( 255 | '.') <= 1, "Job name has too many .'s (see ncluster design: Run/Job/Task hierarchy for convention)" 256 | 257 | 258 | def toseconds(dt) -> float: 259 | """Converts datetime object to seconds.""" 260 | return time.mktime(dt.utctimetuple()) 261 | 262 | 263 | def wait_for_file(fn: str, max_wait_sec: int = 60, 264 | check_interval: float = 1) -> bool: 265 | """ 266 | Waits for file maximum of max_wait_sec. 
Returns True if file was detected within specified max_wait_sec 267 | Args: 268 | fn: filename 269 | max_wait_sec: how long to wait in seconds 270 | check_interval: how often to check in seconds 271 | Returns: 272 | False if waiting was was cut short by max_wait_sec limit, True otherwise 273 | """ 274 | log("Waiting for file", fn) 275 | start_time = time.time() 276 | while True: 277 | if time.time() - start_time > max_wait_sec: 278 | log(f"Timeout exceeded ({max_wait_sec} sec) for {fn}") 279 | return False 280 | if not os.path.exists(fn): 281 | time.sleep(check_interval) 282 | continue 283 | else: 284 | break 285 | return True 286 | 287 | 288 | # locations of default keypair 289 | ID_RSA = os.environ['HOME'] + '/.ssh/id_rsa' 290 | ID_RSA_PUB = ID_RSA + '.pub' 291 | 292 | 293 | def setup_local_ssh_keys() -> str: 294 | """Sanity checks on local ssh keypair and regenerate it if necessary. Returns location of public keypair file""" 295 | 296 | if os.path.exists(ID_RSA_PUB): 297 | assert os.path.exists(ID_RSA), f"Public key {ID_RSA_PUB} exists but private key {ID_RSA} not found, delete {ID_RSA_PUB} and run again to regenerate pair" 298 | log(f"Found local keypair {ID_RSA}") 299 | elif os.path.exists(ID_RSA): 300 | if not os.path.exists(ID_RSA_PUB): 301 | if is_set('NCLUSTER_RUNNING_UNDER_CIRCLECI'): 302 | pass 303 | else: 304 | assert os.path.exists(ID_RSA_PUB), f"Private key {ID_RSA} exists but public key {ID_RSA_PUB} not found, delete {ID_RSA} and run again to regenerate pair" 305 | log(f"Found local keypair {ID_RSA}") 306 | else: 307 | log(f"Generating keypair {ID_RSA}") 308 | with portalocker.Lock(ID_RSA+'.lock', timeout=5) as _: 309 | os.system(f"ssh-keygen -t rsa -f {ID_RSA} -N ''") 310 | os.system(f'rm {ID_RSA}.lock') 311 | 312 | return ID_RSA_PUB 313 | 314 | 315 | def get_authorized_keys() -> str: 316 | """Appends local public key to NCLUSTER_AUTHORIZED_KEYS and returns in format key1;key2;key3; 317 | The result can be assigned back to NCLUSTER_AUTHORIZED_KEYS env var""" 318 | 319 | assert os.path.exists(ID_RSA_PUB), f"{ID_RSA_PUB} not found, make sure to run 'ncluster keys'" 320 | 321 | current_key = open(ID_RSA_PUB).read().strip() 322 | auth_keys = os.environ.get('NCLUSTER_AUTHORIZED_KEYS', '') 323 | return auth_keys+';'+current_key+';' 324 | 325 | 326 | def get_public_key() -> str: 327 | """Returns public key, creating it if needed""" 328 | 329 | if not os.path.exists(ID_RSA_PUB): 330 | print(f"{ID_RSA_PUB} not found, running sure to run setup_local_ssh_keys()") 331 | setup_local_ssh_keys() 332 | assert os.path.exists(ID_RSA_PUB) 333 | 334 | return open(ID_RSA_PUB).read().strip() 335 | 336 | 337 | def exec_command(ssh: paramiko.SSHClient, command: str, bufsize=-1, timeout=None, get_pty=False, environment=None) -> Tuple[paramiko.ChannelFile, paramiko.ChannelFile, paramiko.ChannelFile, paramiko.Channel]: 338 | """Copy of paramiko's exec_command which also returns the channel.""" 339 | 340 | transport: paramiko.Transport = ssh.get_transport() 341 | chan: paramiko.Channel = transport.open_session(timeout=timeout) 342 | if get_pty: 343 | chan.get_pty() 344 | chan.settimeout(timeout) 345 | if environment: 346 | chan.update_environment(environment) 347 | chan.exec_command(command) 348 | stdin: paramiko.ChannelFile = chan.makefile("wb", bufsize) 349 | stdout: paramiko.ChannelFile = chan.makefile("r", bufsize) 350 | stderr: paramiko.ChannelFile = chan.makefile_stderr("r", bufsize) 351 | return stdin, stdout, stderr, chan 352 | 353 | 354 | class timeit: 355 | """Decorator to measure length of 
time spent in the block in millis and log 356 | it to TensorBoard.""" 357 | 358 | def __init__(self, tag=""): 359 | self.tag = tag 360 | 361 | def __enter__(self): 362 | self.start = time.perf_counter() 363 | return self 364 | 365 | def __exit__(self, *args): 366 | self.end = time.perf_counter() 367 | interval_ms = 1000 * (self.end - self.start) 368 | print(f'timeit({self.tag}): {interval_ms})') 369 | 370 | 371 | # no_op method/object that accept every signature 372 | class NoOp: 373 | def __getattr__(self, *args): 374 | def no_op(*_args, **_kwargs): pass 375 | return no_op 376 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | botocore 2 | boto3>=1.9.159 # needed for InterfaceType in create_instances 3 | boto3_type_annotations # for dev use boto3_type_annotations_with_docs 4 | # cryptography==2.4.2 # workaround for https://github.com/paramiko/paramiko/issues/1369 5 | cryptography 6 | paramiko 7 | portalocker 8 | portpicker 9 | pytz 10 | wandb -------------------------------------------------------------------------------- /requirements_benchmarks.txt: -------------------------------------------------------------------------------- 1 | tensorflow 2 | numpy 3 | torch 4 | ray 5 | -------------------------------------------------------------------------------- /requirements_test.txt: -------------------------------------------------------------------------------- 1 | wrapt 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = ncluster 3 | author = Yaroslav Bulatov, Andrew Shaw, Ben Mann 4 | author_email = yaroslavvb@gmail.com 5 | description= Lightweight interface to launching jobs in the cloud 6 | long_description = file: README.md 7 | long_description_content_type = text/markdown 8 | license_file = LICENSE 9 | url = https://github.com/yaroslavvb/ncluster 10 | classifiers = 11 | Programming Language :: Python :: 3 12 | License :: OSI Approved :: MIT License 13 | Operating System :: OS Independent 14 | 15 | [options] 16 | python_requires = >= 3.6 17 | setup_requires = 18 | setuptools >= 38.6 19 | pip >= 10 20 | twine >= 1.11 21 | packages = find: 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import re 3 | 4 | requirements = [] 5 | for line in open('requirements.txt'): 6 | req = line.split('#', 1)[0] # strip comments 7 | requirements.append(req.strip()) 8 | 9 | # follow https://stackoverflow.com/a/7071358/419116 10 | VERSIONFILE = "ncluster/_version.py" 11 | verstrline = open(VERSIONFILE, "rt").read() 12 | VSRE = r"^__version__ = ['\"]([^'\"]*)['\"]" 13 | mo = re.search(VSRE, verstrline, re.M) 14 | if mo: 15 | verstr = mo.group(1) 16 | else: 17 | raise RuntimeError("Unable to find version string in %s." 
% (VERSIONFILE,)) 18 | 19 | setup(scripts=['ncluster/ncluster_cloud_setup.py', # also used as module 20 | 'ncluster/ncluster_cloud_wipe.py', 21 | 'tools/nsync', 22 | 'tools/ncluster'], 23 | install_requires=requirements, 24 | version=verstr, 25 | ) 26 | -------------------------------------------------------------------------------- /tests/integration_test.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import random 5 | import string 6 | import sys 7 | 8 | import wandb 9 | 10 | # in test environments disable pdb intercept 11 | os.environ['NCLUSTER_DISABLE_PDB_HANDLER'] = '1' 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--name', type=str, default='integration_test', help="job name") 15 | parser.add_argument('--instance_type', type=str, default="c5.large") 16 | parser.add_argument('--num_tasks', type=int, default=2) 17 | parser.add_argument('--image_name', type=str, default='Deep Learning AMI (Ubuntu) Version 23.0') 18 | parser.add_argument('--spot', action='store_true', 19 | help='use spot instead of regular instances') 20 | 21 | parser.add_argument('--nproc_per_node', type=int, default=1) 22 | parser.add_argument('--conda_env', type=str, default='pytorch_p36') 23 | 24 | parser.add_argument('--skip_setup', action='store_true') 25 | parser.add_argument('--local_rank', default=0, type=int) 26 | 27 | 28 | parser.add_argument('--role', type=str, default='launcher', 29 | help='internal flag, launcher or worker') 30 | args = parser.parse_args() 31 | 32 | 33 | def random_id(k=5): 34 | """Random id to use for AWS identifiers.""" 35 | # https://stackoverflow.com/questions/2257441/random-string-generation-with-upper-case-letters-and-digits-in-python 36 | return ''.join(random.choices(string.ascii_lowercase + string.digits, k=k)) 37 | 38 | 39 | def launcher(): 40 | # run this test out of root directory of ncluster to capture .git and requirements.txt 41 | script_fn = 'tests/integration_test.py' 42 | 43 | import ncluster 44 | job = ncluster.make_job(**vars(args)) 45 | job.rsync('.') 46 | job.run('pip install -r requirements.txt') 47 | task0 = job.tasks[0] 48 | 49 | task0.run(f'python {script_fn} --role=worker --name={args.name}-{random_id()} --local_rank=0', stream_output=True) 50 | 51 | 52 | def main(): 53 | if args.role == "launcher": 54 | launcher() 55 | elif args.role == "worker": 56 | # rank = int(os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK', 0)) # ompi way 57 | # rank = int(os.environ.get('RANK', '0')) # pytorch way 58 | rank = args.local_rank # cmd args way 59 | 60 | if rank != 0: 61 | os.environ['WANDB_MODE'] = 'dryrun' # all wandb.log are no-op 62 | wandb.init(project='ncluster', name=args.name, entity='circleci') 63 | print(f"{os.uname()[1]} {rank} {' '.join(sys.argv)}") 64 | sys.stdout.flush() 65 | 66 | 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /tests/join_test.py: -------------------------------------------------------------------------------- 1 | import ncluster 2 | import pytest 3 | 4 | def test(): 5 | task = ncluster.make_task(image_name=ncluster.aws_backend.GENERIC_SMALL_IMAGE) 6 | task.run("mkdir /illegal", non_blocking=True) 7 | task.join(ignore_errors=True) # this succeed/print error message 8 | 9 | task.run("mkdir /illegal", non_blocking=True) 10 | with pytest.raises(RuntimeError): 11 | task.join() # this should fail 12 | 13 | if __name__ == '__main__': 14 | test() 15 | 
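Note: join_test.py above exercises the non-blocking run pattern. The following is a minimal usage sketch, not part of the test suite, assuming a configured AWS backend that implements the same Task interface as old_backend.py (run, join, wait_for_file); the marker file path is illustrative only.

# Hedged sketch: non-blocking run + join, modeled on tests/join_test.py.
# Assumes AWS credentials / ncluster setup are already in place.
import ncluster

def example_nonblocking_join():
    task = ncluster.make_task(image_name=ncluster.aws_backend.GENERIC_SMALL_IMAGE)

    # Start a long-running command without blocking the launcher.
    # '/tmp/done_marker' is a hypothetical sentinel file.
    task.run('sleep 5 && touch /tmp/done_marker', non_blocking=True)

    # ... launcher-side work could happen here ...

    task.join(ignore_errors=True)                     # wait for the last run() to finish
    task.wait_for_file('/tmp/done_marker', max_wait_sec=60)  # file-based completion check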
-------------------------------------------------------------------------------- /tests/logdir_test.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # tests to make sure that logdir logic works 3 | import inspect 4 | import random 5 | import sys 6 | import threading 7 | 8 | import ncluster 9 | 10 | 11 | def test_two_jobs(): 12 | run = ncluster.make_run('logdir_test') 13 | job1 = run.make_job('job1') 14 | task1 = job1.tasks[0] 15 | task1.run(f'echo hello > {task1.logdir}/message') 16 | job2 = run.make_job('job2') 17 | task2 = job2.tasks[0] 18 | assert task2.read(f'{task2.logdir}/message').strip() == 'hello' 19 | 20 | 21 | def test_multiple_logdirs(): 22 | logdir1 = ncluster.get_logdir_root() + '/test1' 23 | dummy_task = ncluster.make_task() 24 | dummy_task.run(f'rm -Rf {logdir1}') 25 | task1 = ncluster.make_task(run_name='test1') 26 | assert task1.logdir == logdir1 27 | 28 | logdir2 = ncluster.get_logdir_root() + '/test2' 29 | task2 = ncluster.make_task(run_name='test2') 30 | dummy_task.run(f'rm -Rf {logdir2}*') 31 | dummy_task.run(f'mkdir {logdir2}') 32 | assert task2.logdir == logdir2 + '.01' 33 | 34 | 35 | def test_multiple_logdir_tasks(): 36 | n = 10 37 | dummy_task = ncluster.make_task() 38 | logdir1 = ncluster.get_logdir_root() + '/test1' 39 | dummy_task.run(f'rm -Rf {logdir1}') 40 | job = ncluster.make_job(run_name='test1', num_tasks=n) 41 | 42 | obtained_logdirs = [] 43 | 44 | import wrapt 45 | 46 | @wrapt.synchronized 47 | def query(i): 48 | obtained_logdirs.append(job.tasks[i].logdir) 49 | 50 | threads = [threading.Thread(target=query, args=(i,)) for i in range(n)] 51 | for thread in reversed(threads): 52 | thread.start() 53 | 54 | random.shuffle(threads) 55 | for thread in threads: 56 | thread.join() 57 | 58 | assert len(set(obtained_logdirs)) == 1 59 | assert obtained_logdirs[0] == logdir1 60 | 61 | 62 | def run_all_tests(module): 63 | all_functions = inspect.getmembers(module, inspect.isfunction) 64 | for name, func in all_functions: 65 | if name.startswith('test'): 66 | print("Testing " + name) 67 | func() 68 | print(module.__name__ + " tests passed.") 69 | 70 | 71 | def manual(): 72 | run_all_tests(sys.modules[__name__]) 73 | 74 | 75 | if __name__ == '__main__': 76 | manual() 77 | -------------------------------------------------------------------------------- /tests/many_commands_test.py: -------------------------------------------------------------------------------- 1 | import ncluster 2 | import ncluster.util as util 3 | 4 | # Test for a fix to exception with too many concurrent connections ("paramiko.ssh_exception.ChannelException: (1, 'Administratively prohibited')) 5 | 6 | 7 | 8 | def test(): 9 | task = ncluster.make_task('test2') 10 | for i in range(20): 11 | task.run('ls', stream_output=True) 12 | 13 | 14 | if __name__ == '__main__': 15 | test() 16 | -------------------------------------------------------------------------------- /tests/run_test.py: -------------------------------------------------------------------------------- 1 | import ncluster 2 | 3 | def test(): 4 | run = ncluster.make_run('run_test') 5 | job1 = run.make_job('job1') 6 | task1 = job1.tasks[0] 7 | assert task1.name == '0.job1.run_test' 8 | task1.run(f'echo task1sayshello > {task1.logdir}/message') 9 | job2 = run.make_job('job2') 10 | task2 = job2.tasks[0] 11 | assert task2.name == '0.job2.run_test' 12 | assert task2.read(f'{task2.logdir}/message').strip() == 'task1sayshello' 13 | 14 | 15 | if __name__ == '__main__': 16 | test() 17 | 
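Note: run_test.py and logdir_test.py above rely on the Run/Job/Task conventions documented in ncluster_globals.py and old_backend.py: task names reverse to `<index>.<job>.<run>`, and all tasks in a run share one logdir. The sketch below is illustrative only (the training command and job sizes are placeholders), assuming the default AWS backend.

# Hedged sketch of a two-job launcher built on the Run/Job/Task hierarchy.
import ncluster

def example_run_hierarchy():
    run = ncluster.make_run('demo_run')     # container for related jobs
    workers = run.make_job('worker')        # task named 0.worker.demo_run
    tb = run.make_job('tb')                 # task named 0.tb.demo_run

    # All tasks in a run share the same logdir (see logdir_test.py above).
    logdir = workers.tasks[0].logdir
    workers.run(f'python train.py --logdir={logdir}')                 # 'train.py' is hypothetical
    tb.tasks[0].run(f'tensorboard --logdir={logdir}', non_blocking=True)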
-------------------------------------------------------------------------------- /tools/ncluster: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # tool to automate various AWS commands 3 | import datetime as dt 4 | import os 5 | import shlex 6 | import subprocess 7 | import sys 8 | import time 9 | from typing import Dict 10 | 11 | import pytz 12 | 13 | import ncluster 14 | 15 | from ncluster import aws_util as u 16 | from ncluster import util 17 | from ncluster.aws_backend import INSTANCE_INFO 18 | from boto3_type_annotations.ec2 import Volume 19 | from boto3_type_annotations.ec2 import Image 20 | 21 | VERBOSE = False 22 | 23 | 24 | def _run_shell(user_cmd): 25 | """Runs shell command, returns list of outputted lines 26 | with newlines stripped""" 27 | # print(cmd) 28 | p = subprocess.Popen(user_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 29 | (stdout, _) = p.communicate() 30 | stdout = stdout.decode('ascii') if stdout else '' 31 | lines = stdout.split('\n') 32 | stripped_lines = [] 33 | for l in lines: 34 | stripped_line = l.strip() 35 | if l: 36 | stripped_lines.append(stripped_line) 37 | return stripped_lines 38 | 39 | 40 | def _check_instance_found(instances, fragment, states=()): 41 | if not instances: 42 | if states: 43 | print(f"Couldn't find instances in state {states} matching '{fragment}' for key {u.get_keypair_name()}") 44 | else: 45 | print(f"Couldn't find instances matching '{fragment}' for key {u.get_keypair_name()}") 46 | return False 47 | return True 48 | 49 | 50 | def vprint(*args): 51 | if VERBOSE: 52 | print(*args) 53 | 54 | 55 | def toseconds(dt_): 56 | """Converts datetime object to seconds.""" 57 | return time.mktime(dt_.utctimetuple()) 58 | 59 | 60 | def ls(fragment=''): 61 | """List running instances""" 62 | print(f"https://console.aws.amazon.com/ec2/v2/home?region={u.get_region()}") 63 | 64 | stopped_instances = u.lookup_instances(fragment, states=['stopped']) 65 | stopped_names = list(u.get_name(i) for i in stopped_instances) 66 | if stopped_names: 67 | print("ignored stopped instances: ", ", ".join(stopped_names)) 68 | 69 | instances = u.lookup_instances(fragment) 70 | print('-' * 80) 71 | print( 72 | f"{'name':18s} {'hours_live':>10s} {'cost_in_$':>10s} {'instance_type':>15s} {'public_ip':>15s} " 73 | f"{'key/owner':>15s} {'private ip':>15s}") 74 | print('-' * 80) 75 | for instance in instances[::-1]: 76 | # current time in UTC zone (default AWS) 77 | now_time = dt.datetime.utcnow().replace(tzinfo=pytz.utc) 78 | launch_time = instance.launch_time 79 | elapsed_sec = toseconds(now_time) - toseconds(launch_time) 80 | elapsed_hours = elapsed_sec / 3600 81 | instance_type = instance.instance_type 82 | if instance_type in INSTANCE_INFO: 83 | cost = INSTANCE_INFO[instance_type]['cost'] * elapsed_hours 84 | else: 85 | cost = -1 86 | key_name = str(instance.key_name) # could be None 87 | print(f"{u.get_name(instance):18s} {elapsed_sec / 3600:10.1f} {cost:10.0f} {instance_type[:5]:>15s} " 88 | f"{instance.public_ip_address:>15s} {key_name[9:]:>15s} {instance.private_ip_address:>15s} " 89 | f"{instance.placement_group.name} ") 90 | 91 | # list spot requests, ignore active ones since they show up already 92 | client = u.get_ec2_client() 93 | spot_requests = [] 94 | for request in client.describe_spot_instance_requests()['SpotInstanceRequests']: 95 | state = request['State'] 96 | # TODO(y) also ignore state == 'fulfilled'? 
97 | if state == 'cancelled' or state == 'closed' or state == 'active': 98 | continue 99 | 100 | launch_spec = request['LaunchSpecification'] 101 | spot_requests.append(launch_spec['InstanceType']) 102 | if spot_requests: 103 | print(f"Pending spot instances: {','.join(spot_requests)}") 104 | # client.cancel_spot_instance_requests(SpotInstanceRequestIds=[request['SpotInstanceRequestId']]) 105 | 106 | 107 | def etchosts(_): 108 | """Copy/pastable /etc/hosts file""" 109 | instances = u.lookup_instances() 110 | instance_tuples = [(u.get_name(i), i.public_ip_address) for i in instances] 111 | print('-' * 80) 112 | print("paste following into your /etc/hosts") 113 | print('-' * 80) 114 | for name, ip in sorted(instance_tuples): 115 | print(f"{ip} {name}") 116 | 117 | print("""\n127.0.0.1 localhost 118 | 255.255.255.255 broadcasthost 119 | ::1 localhost""") 120 | 121 | 122 | def _user_keypair_check(instance): 123 | launching_user = instance.key_name[len(u.get_prefix()) + 1:] 124 | current_user = os.environ['USER'] 125 | assert launching_user == current_user, f"Set USER={launching_user} to connect to this machine, and make sure their " \ 126 | f".pem file is in your ~/.ncluster" 127 | 128 | 129 | def ssh(fragment=''): 130 | """SSH into the instace with the given prefix.""" 131 | instances = u.lookup_instances(fragment) 132 | if not _check_instance_found(instances, fragment): 133 | return 134 | instance = instances[0] 135 | if len(instances) > 1: 136 | print(f"Found {len(instances)} instances matching {fragment}, connecting to most recent {u.get_name(instance)} " 137 | f"launched by {instance.key_name}") 138 | else: 139 | print(f"Connecting to {u.get_name(instance)} " 140 | f"launched by {instance.key_name}") 141 | 142 | _user_keypair_check(instance) 143 | user_cmd = f"ssh -t -i {u.get_keypair_fn()} -o StrictHostKeyChecking=no -o ServerAliveCountMax=1 " \ 144 | f"-o ServerAliveInterval=60 " \ 145 | f"{u.get_aws_username(instance)}@{instance.public_ip_address} " 146 | print(user_cmd) 147 | os.system(user_cmd) 148 | 149 | 150 | def reboot(fragment=''): 151 | """reboots given instance.""" 152 | instances = u.lookup_instances(fragment) 153 | if not _check_instance_found(instances, fragment): 154 | return 155 | instance = instances[0] 156 | if len(instances) > 1: 157 | print(f"Found {len(instances)} instances matching {fragment}, connecting to most recent {u.get_name(instance)} " 158 | f"launched by {instance.key_name}") 159 | else: 160 | print(f"Rebooting to {u.get_name(instance)} ({instance.id})" 161 | f"launched by {instance.key_name}") 162 | 163 | _user_keypair_check(instance) 164 | instance.reboot() 165 | 166 | 167 | def old_ssh(fragment=''): 168 | """SSH into the instace with the given prefix. 
Works on dumb terminals.""" 169 | instances = u.lookup_instances(fragment) 170 | if not _check_instance_found(instances, fragment): 171 | return 172 | instance = instances[0] 173 | if len(instances) > 1: 174 | print(f"Found {len(instances)} instances matching {fragment}, connecting to most recent {u.get_name(instance)} " 175 | f"launched by {instance.key_name}") 176 | else: 177 | print(f"Connecting to {u.get_name(instance)} " 178 | f"launched by {instance.key_name}") 179 | 180 | _user_keypair_check(instance) 181 | user_cmd = f"ssh -i {u.get_keypair_fn()} -o StrictHostKeyChecking=no -o ConnectTimeout=10 " \ 182 | f"-o ServerAliveCountMax=1 " \ 183 | f"-o ServerAliveInterval=60 " \ 184 | f"{u.get_aws_username(instance)}@{instance.public_ip_address}" 185 | print(user_cmd) 186 | os.system(user_cmd) 187 | 188 | 189 | def connect(fragment=''): 190 | """SSH into the instance using authorized keys mechanism.""" 191 | instances = u.lookup_instances(fragment) 192 | if not _check_instance_found(instances, fragment): 193 | return 194 | instance = instances[0] 195 | if len(instances) > 1: 196 | print(f"Found {len(instances)} instances matching {fragment}, connecting to most recent {u.get_name(instance)} " 197 | f"launched by {instance.key_name}") 198 | else: 199 | print(f"Connecting to {u.get_name(instance)} " 200 | f"launched by {instance.key_name}") 201 | 202 | ssh_cmd = f"ssh -t -o StrictHostKeyChecking=no -o ConnectTimeout=10 " \ 203 | f"-o ServerAliveCountMax=1 " \ 204 | f"-o ServerAliveInterval=60 " \ 205 | f"{u.get_aws_username(instance)}@{instance.public_ip_address} " 206 | connect_cmd = ssh_cmd 207 | do_tmux = False 208 | if 'INSIDE_EMACS' in os.environ: 209 | print("detected Emacs, skipping tmux attach") 210 | elif os.environ.get('TERM', 'dumb') == 'dumb': 211 | print("Dumb terminal, doesn't support tmux, skipping tmux attach") 212 | elif 'NO_TMUX' in os.environ: 213 | print("detected NO_TMUX, skipping tmux attach") 214 | else: 215 | do_tmux = True 216 | connect_cmd += " tmux a" 217 | 218 | print(connect_cmd) 219 | exit_code = os.system(connect_cmd) 220 | 221 | if exit_code != 0 and do_tmux: 222 | fix_cmd = ssh_cmd + " tmux new" 223 | print(f"Creating ssh tmux a returned {exit_code}, recreate tmux using '{fix_cmd}'") 224 | return 225 | else: 226 | print(f"cmd {connect_cmd} returned {exit_code}") 227 | 228 | 229 | def connectm(fragment=''): 230 | """Like connect, but uses mosh""" 231 | instances = u.lookup_instances(fragment) 232 | if not _check_instance_found(instances, fragment): 233 | return 234 | instance = instances[0] 235 | if len(instances) > 1: 236 | print(f"Found {len(instances)} instances matching {fragment}, connecting to most recent {u.get_name(instance)} " 237 | f"launched by {instance.key_name}") 238 | else: 239 | print(f"Connecting to {u.get_name(instance)} " 240 | f"launched by {instance.key_name}") 241 | 242 | user_cmd = f"mosh --ssh='ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 " \ 243 | f"-o ServerAliveCountMax=1 " \ 244 | f"-o ServerAliveInterval=60' " \ 245 | f"{u.get_aws_username(instance)}@{instance.public_ip_address}" 246 | print(user_cmd) 247 | os.system(user_cmd) 248 | 249 | 250 | def kill(fragment: str = '', stop_instead_of_kill: bool = False): 251 | """ 252 | 253 | Args: 254 | fragment: 255 | stop_instead_of_kill: use stop_instances instead of terminate_instances 256 | """ 257 | 258 | if stop_instead_of_kill: 259 | states = ['running'] 260 | else: 261 | states = ['running', 'stopped'] 262 | instances = u.lookup_instances(fragment, states=states, 
limit_to_current_user=False) 263 | instances_to_kill = [] 264 | instances_to_skip = [] 265 | users_to_skip = set() 266 | instances_to_kill_formatted = [] 267 | for i in instances: 268 | state = i.state['Name'] 269 | 270 | if not LIMIT_TO_CURRENT_USER or i.key_name == u.get_keypair_name(): 271 | instances_to_kill_formatted.append((" ", u.get_name(i), i.instance_type, i.key_name, state if state == 'stopped' else '')) 272 | instances_to_kill.append(i) 273 | else: 274 | instances_to_skip.append(u.get_name(i)) 275 | users_to_skip.add(i.key_name[9:]) 276 | 277 | if stop_instead_of_kill: 278 | action = 'stopping' 279 | override_action = 'reallystop' 280 | else: 281 | action = 'terminating' 282 | override_action = 'reallykill' 283 | 284 | if instances_to_skip: 285 | print(f"Skipping {','.join(instances_to_skip)} launched by ({', '.join(users_to_skip)}), override with {override_action}") 286 | if not _check_instance_found(instances_to_kill, fragment, states): 287 | return 288 | 289 | print(f"{action}:") 290 | for line in instances_to_kill_formatted: 291 | print(*line) 292 | 293 | ec2_client = u.get_ec2_client() 294 | # don't ask for confirmation when stopping, erronous stopping has milder consequences 295 | num_instances = len(instances_to_kill) 296 | # if util.is_set("NCLUSTER_SKIP_CONFIRMATION") or stop_instead_of_kill: 297 | # print("NCLUSTER_SKIP_CONFIRMATION is set or stop_instead_of_kill, skipping confirmation") 298 | answer = input(f"{num_instances} instances found, {action} in {u.get_region()}? (y/N) ") 299 | 300 | if answer.lower() == "y": 301 | instance_ids = [i.id for i in instances_to_kill] 302 | 303 | if stop_instead_of_kill: 304 | response = ec2_client.stop_instances(InstanceIds=instance_ids) 305 | else: 306 | response = ec2_client.terminate_instances(InstanceIds=instance_ids) 307 | 308 | assert u.is_good_response(response), response 309 | print(f"{action} {num_instances} instances: success") 310 | else: 311 | print("Didn't get y, doing nothing") 312 | 313 | 314 | def stop(fragment=''): 315 | kill(fragment, stop_instead_of_kill=True) 316 | 317 | 318 | LIMIT_TO_CURRENT_USER = True 319 | 320 | 321 | def reallykill(*args, **kwargs): 322 | """Kill instances, including ones launched by other users.""" 323 | global LIMIT_TO_CURRENT_USER 324 | LIMIT_TO_CURRENT_USER = False 325 | kill(*args, **kwargs) 326 | LIMIT_TO_CURRENT_USER = True 327 | 328 | 329 | def reallystop(*args, **kwargs): 330 | """Stop instances, including ones launched by other users.""" 331 | global LIMIT_TO_CURRENT_USER 332 | LIMIT_TO_CURRENT_USER = False 333 | stop(*args, **kwargs) 334 | LIMIT_TO_CURRENT_USER = True 335 | 336 | 337 | def start(fragment=''): 338 | instances = u.lookup_instances(fragment, states=['stopped']) 339 | for i in instances: 340 | print(u.get_name(i), i.instance_type, i.key_name) 341 | 342 | if not instances: 343 | print("no stopped instances found, quitting") 344 | return 345 | 346 | # answer = input(f"{len(instances)} instances found, start in {u.get_region()}? 
(y/N) ") 347 | answer = 'y' 348 | 349 | if answer.lower() == "y": 350 | for i in instances: 351 | print(f"starting {u.get_name(i)}") 352 | i.start() 353 | else: 354 | print("Didn't get y, doing nothing") 355 | return 356 | 357 | print("Warning, need to manually mount efs on instance: ") 358 | print_efs_mount_command() 359 | 360 | 361 | def mosh(fragment=''): 362 | instances = u.lookup_instances(fragment) 363 | if not _check_instance_found(instances, fragment): 364 | return 365 | instance = instances[0] 366 | print(f"Found {len(instances)} instances matching {fragment}, connecting to most recent {u.get_name(instance)}") 367 | _user_keypair_check(instance) 368 | 369 | user_cmd = f"mosh --ssh='ssh -i {u.get_keypair_fn()} -o StrictHostKeyChecking=no' " \ 370 | f"{u.get_aws_username(instance)}@{instance.public_ip_address}" # tmux attach" 371 | print(user_cmd) 372 | os.system(user_cmd) 373 | 374 | 375 | def print_efs_mount_command(): 376 | print(u.get_efs_mount_command()) 377 | 378 | 379 | def efs(_): 380 | print("EFS information. To upload to remote EFS use 'ncluster efs_sync'") 381 | print_efs_mount_command() 382 | print() 383 | print() 384 | 385 | efs_client = u.get_efs_client() 386 | response = efs_client.describe_file_systems() 387 | assert u.is_good_response(response), response 388 | 389 | for efs_response in response['FileSystems']: 390 | # {'CreationTime': datetime.datetime(2017, 12, 19, 10, 3, 44, tzinfo=tzlocal()), 391 | # 'CreationToken': '1513706624330134', 392 | # 'Encrypted': False, 393 | # 'FileSystemId': 'fs-0f95ab46', 394 | # 'LifeCycleState': 'available', 395 | # 'Name': 'nexus01', 396 | # 'NumberOfMountTargets': 0, 397 | # 'OwnerId': '316880547378', 398 | # 'PerformanceMode': 'generalPurpose', 399 | # 'SizeInBytes': {'Value': 6144}}, 400 | efs_id = efs_response['FileSystemId'] 401 | tags_response = efs_client.describe_tags(FileSystemId=efs_id) 402 | assert u.is_good_response(tags_response) 403 | key = u.get_name(tags_response.get('Tags', '')) 404 | print("%-16s %-16s" % (efs_id, key)) 405 | print('-' * 40) 406 | 407 | # list mount points 408 | response = efs_client.describe_mount_targets(FileSystemId=efs_id) 409 | ec2 = u.get_ec2_resource() 410 | if not response['MountTargets']: 411 | print("") 412 | else: 413 | for mount_response in response['MountTargets']: 414 | subnet = ec2.Subnet(mount_response['SubnetId']) 415 | zone = subnet.availability_zone 416 | state = mount_response['LifeCycleState'] 417 | id_ = mount_response['MountTargetId'] 418 | ip = mount_response['IpAddress'] 419 | print('%-16s %-16s %-16s %-16s' % (zone, ip, id_, state,)) 420 | 421 | 422 | def terminate_tmux(_): 423 | """Script to clean-up tmux sessions.""" 424 | 425 | for line in _run_shell('tmux ls'): 426 | session_name = line.split(':', 1)[0] 427 | 428 | if session_name == 'tensorboard' or session_name == 'jupyter' or session_name == 'dropbox': 429 | print("Skipping " + session_name) 430 | continue 431 | print("Killing " + session_name) 432 | _run_shell('tmux kill-session -t ' + session_name) 433 | 434 | 435 | def nano(*_unused_args): 436 | """Bring up t2.nano instance.""" 437 | ncluster.make_task(name='shell', 438 | instance_type='t2.nano') 439 | 440 | 441 | def cmd(user_cmd): 442 | """Finds most recent instance launched by user, runs commands there, pipes output to stdout""" 443 | 444 | instances = u.lookup_instances(limit_to_current_user=True) 445 | assert instances, f"{u.get_username()} doesn't have an instances to connect to. Use 'ncluster nano'" \ 446 | f" to bring up a small instance." 
447 | instance = instances[0] 448 | user_cmd = f"ssh -t -i {u.get_keypair_fn()} -o StrictHostKeyChecking=no " \ 449 | f"{u.get_aws_username(instance)}@{instance.public_ip_address} {user_cmd}" 450 | os.system(user_cmd) 451 | 452 | 453 | def cat(user_cmd): cmd('cat ' + user_cmd) 454 | 455 | 456 | def ls_(user_cmd): cmd('ls ' + user_cmd) 457 | 458 | 459 | def cleanup_placement_groups(*_args): 460 | print("Deleting all placement groups") 461 | # TODO(y): don't delete groups that have currently stopped instances 462 | client = u.get_ec2_client() 463 | for group in client.describe_placement_groups().get('PlacementGroups', []): 464 | name = group['GroupName'] 465 | sys.stdout.write(f"Deleting {name} ... ") 466 | sys.stdout.flush() 467 | try: 468 | client.delete_placement_group(GroupName=name) 469 | print("success") 470 | except Exception as _: 471 | print("failed") 472 | 473 | 474 | # ncluster launch --image_name=dlami23-efa --instance_type=c5.large --name=test 475 | def launch(args_str): 476 | import argparse 477 | parser = argparse.ArgumentParser() 478 | parser.add_argument('--name', type=str, default='ncluster_launch', help="instance name") 479 | # parser.add_argument('--image_name', type=str, default='') # default small image 480 | parser.add_argument('--image_name', type=str, default='Deep Learning AMI (Ubuntu) Version 23.0') 481 | # can also use --image_name='Deep Learning AMI (Amazon Linux) Version 23.0' # cybertronai01 482 | parser.add_argument('--instance_type', type=str, default='c5.large', help="type of instance") 483 | parser.add_argument('--disk_size', type=int, default=0, help="size of disk in GBs. If 0, use default size for the image") 484 | args = parser.parse_args(shlex.split(args_str)) 485 | 486 | return ncluster.make_task(**vars(args)) 487 | 488 | 489 | def fix_default_security_group(_): 490 | """Allows ncluster and ncluster_nd security groups to exchange traffic with each other.""" 491 | 492 | def peer(current, other): 493 | """allow current group to accept all traffic from other group""" 494 | 495 | groups = u.get_security_group_dict() 496 | current_group = groups[current] 497 | other_group = groups[other] 498 | response = {} 499 | for protocol in ['icmp']: 500 | try: 501 | rule = {'FromPort': -1, 502 | 'IpProtocol': protocol, 503 | 'IpRanges': [], 504 | 'PrefixListIds': [], 505 | 'ToPort': -1, 506 | 'UserIdGroupPairs': [{'GroupId': other_group.id}]} 507 | response = current_group.authorize_ingress(IpPermissions=[rule]) 508 | 509 | except Exception as e: 510 | if 'InvalidPermission.Duplicate' in str(e):  # rule already exists, check the exception rather than the last response 511 | print("Warning, got " + str(e)) 512 | else: 513 | assert False, "Failed while authorizing ingress with " + str(e) 514 | 515 | for protocol in ['tcp', 'udp']: 516 | try: 517 | rule = {'FromPort': 0, 518 | 'IpProtocol': protocol, 519 | 'IpRanges': [], 520 | 'PrefixListIds': [], 521 | 'ToPort': 65535, 522 | 'UserIdGroupPairs': [{'GroupId': other_group.id}]} 523 | response = current_group.authorize_ingress(IpPermissions=[rule]) 524 | except Exception as e: 525 | if 'InvalidPermission.Duplicate' in str(e): 526 | print("Warning, got " + str(e)) 527 | else: 528 | assert False, "Failed while authorizing ingress with " + str(e) 529 | 530 | group1 = u.get_security_group_name() 531 | group2 = u.get_security_group_nd_name() 532 | 533 | peer(group1, group2) 534 | peer(group2, group1) 535 | 536 | 537 | def keys(_): 538 | """Runs ssh-keygen if necessary, prints public key.""" 539 | key = util.get_public_key() 540 | print("Your public key is below. 
Append all of your team members' public keys to NCLUSTER_AUTHORIZED_KEYS env var separated by ; i.e. \nNCLUSTER_AUTHORIZED_KEYS=;;\n") 541 | print(key) 542 | 543 | 544 | def lookup_image(image_id: str) -> Image: 545 | """Looks up image from image id like 'ami-0cc96feef8c6bbff3', prints image.name""" 546 | # could use ec2.images.filter(ImageIds=['ami-0cc96feef8c6bbff3']) 547 | assert image_id.startswith('ami-') 548 | ec2 = u.get_ec2_resource() 549 | images = list(ec2.images.filter(ImageIds=[image_id])) 550 | assert images, f"No images found with id={image_id}" 551 | assert len(images) == 1, f"Multiple images found with id={image_id}: {','.join(i.name for i in images)}" 552 | image = [im for im in images if im.id == image_id][0] 553 | print(image.name) 554 | return image 555 | 556 | 557 | def grow_disks(fragment: str, target_size_gb=500): 558 | """Grows disks for given machine to target_size_gb (default 500 GB)""" 559 | 560 | instance = u.lookup_instance(fragment) 561 | client = u.get_ec2_client() 562 | 563 | volumes = list(instance.volumes.all()) 564 | for vol in volumes: 565 | volume: Volume = vol 566 | if volume.size < target_size_gb: 567 | print("Growing %s to %s" % (volume.id, target_size_gb)) 568 | response = client.modify_volume(VolumeId=volume.id, Size=target_size_gb) 569 | assert u.is_good_response(response) 570 | else: 571 | print(f"Volume {volume.id} is already {volume.size} GB, skipping") 572 | 573 | 574 | def disks(fragment): 575 | """Print disk information for instance.""" 576 | instances = u.lookup_instances(fragment, states=('running', 'stopped')) 577 | print(f"{'device':>10s} {'size':>10s} {'type':>10s}({'iops'}) {'id':>30s} ") 578 | print("-" * 50) 579 | for instance in instances: 580 | print() 581 | print(f"Disks on instance '{u.get_name(instance)}' ({instance.id}, {instance.placement['AvailabilityZone']})") 582 | for volume in instance.volumes.all(): 583 | device = volume.attachments[0]['Device'] 584 | print(f"{device:>10s} {volume.size:>10d} {volume.volume_type:>10s}({volume.iops}) {volume.id:>30s} {u.get_name(volume)}") 585 | print("Unattached disks") 586 | 587 | ec2 = u.get_ec2_resource() 588 | for volume in ec2.volumes.all(): 589 | if not volume.attachments: 590 | print(f"{u.get_name(volume)} {volume.size:>10d} {volume.volume_type:>10s} {volume.id:>30s}") 591 | 592 | 593 | def fixkeys(_): 594 | key_name = u.get_keypair_name() 595 | pairs = u.get_keypair_dict() 596 | if key_name not in pairs: 597 | print(f"Default keypair {key_name} does not exist, returning") 598 | return 599 | keypair = pairs[key_name] 600 | 601 | print(f"Deleting current user keypair {key_name}") 602 | ec2 = u.get_ec2_resource() 603 | instance_list = [] 604 | for instance in ec2.instances.all(): 605 | if instance.state['Name'] == 'terminated': 606 | continue 607 | instance_list.append(instance) 608 | if instance_list: 609 | print("Warning, after deleting keypair, the following instances will no longer be accessible:") 610 | for i in instance_list: 611 | print(u.get_name(i), i.id) 612 | answer = input("Proceed? 
(y/N) ") 613 | else: 614 | answer = "y" 615 | if answer.lower() == 'y': 616 | keypair_fn = u.get_keypair_fn() 617 | if os.path.exists(keypair_fn): 618 | print(f"Deleting local .pem file '{keypair_fn}'") 619 | os.system(f'sudo rm -f {keypair_fn}') 620 | print(f"Deleting AWS keypair '{keypair.name}'") 621 | keypair.delete() 622 | 623 | 624 | def efs_sync(_): 625 | """Starts a daemon to sync local /ncluster/sync with remote /ncluster/sync.""" 626 | 627 | print("Syncing local /ncluster/sync to remote /ncluster/sync") 628 | if not os.path.exists('/ncluster/sync'): 629 | print("Local /ncluster/sync doesn't exist, creating") 630 | if not os.path.exists('/ncluster'): 631 | os.system('sudo mkdir /ncluster') 632 | os.system('sudo chown `whoami` /ncluster') 633 | os.system('mkdir /ncluster/sync') 634 | 635 | instances = u.lookup_instances(limit_to_current_user=True) 636 | if instances: 637 | instance = instances[0] 638 | print(f"Found {len(instances)} instances owned by {u.get_username()}, using {u.get_name(instance)} for syncing") 639 | else: 640 | print(f"Found no instances owned by {u.get_username()}, launching t2.nano instance to do the sync.") 641 | task = ncluster.make_task(name='shell', instance_type='t2.nano') 642 | instance = task.instance 643 | 644 | os.system(f'cd /ncluster/sync && nsync -m {u.get_name(instance)} -d /ncluster/sync') 645 | 646 | 647 | def spot_prices(instance_type: str) -> Dict[str, float]: 648 | """ 649 | 650 | Print spot instance pricing. 651 | Args: 652 | instance_type: AWS instance name. Common names can be shortcuts (p2, p3, c5) 653 | 654 | Returns: 655 | dictionary of zone->price for given instance 656 | """ 657 | if instance_type == 'p3': 658 | instance_type = 'p3.16xlarge' 659 | elif instance_type == 'p3dn': 660 | instance_type = 'p3dn.24xlarge' 661 | elif instance_type == 'p2': 662 | instance_type = 'p2.16xlarge' 663 | elif instance_type == 'c5': 664 | instance_type = 'c5.18xlarge' 665 | elif instance_type == 'c5n': 666 | instance_type = 'c5n.18xlarge' 667 | else: 668 | # expecting exact match 669 | pass 670 | product = 'Linux/UNIX (Amazon VPC)' 671 | result = {} 672 | client = u.get_ec2_client() 673 | print(f"Prices for instance {instance_type} on Linux") 674 | for zone in [z['ZoneName'] for z in client.describe_availability_zones()['AvailabilityZones'] if z['State'] == 'available']: 675 | try: 676 | price = client.describe_spot_price_history(InstanceTypes=[instance_type], 677 | MaxResults=1, 678 | ProductDescriptions=[product], 679 | AvailabilityZone=zone)['SpotPriceHistory'][0]['SpotPrice'] 680 | price = float(price) 681 | print(f"{zone}: {price:.2f}") 682 | result[zone] = price 683 | except IndexError as _: 684 | pass 685 | return result 686 | 687 | 688 | COMMANDS = { 689 | 'ls': ls, 690 | 'ssh': ssh, 691 | 'ssh_': old_ssh, 692 | 'old_ssh': old_ssh, 693 | 'mosh': mosh, 694 | 'kill': kill, 695 | 'reallykill': reallykill, 696 | 'stop': stop, 697 | 'reallystop': reallystop, 698 | 'start': start, 699 | 'efs': efs, 700 | 'cat': cat, 701 | 'ls_': ls_, 702 | 'nano': nano, 703 | 'cmd': cmd, 704 | '/etc/hosts': etchosts, 705 | 'hosts': etchosts, 706 | 'terminate_tmux': terminate_tmux, 707 | 'cleanup_placement_groups': cleanup_placement_groups, 708 | 'lookup_image': lookup_image, 709 | 'lookup_image_name': lookup_image, 710 | 'reboot': reboot, 711 | 'launch': launch, 712 | 'fix_default_security_group': fix_default_security_group, 713 | 'keys': keys, 714 | 'connect': connect, 715 | 'connectm': connectm, 716 | 'grow_disks': grow_disks, 717 | 'disks': disks, 718 | 'fixkeys': fixkeys,
719 | 'efs_sync': efs_sync, 720 | 'spot_prices': spot_prices, 721 | } 722 | 723 | 724 | def main(): 725 | print(f"Region ({u.get_region()}) $USER ({u.get_username()}) account ({u.get_account_number()}:{u.get_account_name()})") 726 | if len(sys.argv) < 2: 727 | mode = 'ls' 728 | else: 729 | mode = sys.argv[1] 730 | if mode == 'help': 731 | for k, v in COMMANDS.items(): 732 | if v.__doc__: 733 | print(f'{k}\t{v.__doc__}') 734 | else: 735 | print(k) 736 | return 737 | if mode not in COMMANDS: 738 | assert False, f"unknown command '{mode}', available commands are {', '.join(str(a) for a in COMMANDS.keys())}" 739 | 740 | # for connect commands, don't escape args in order for exact match to work (ncluster connect `exactname`) 741 | if mode == 'connect' or mode == 'ssh' or mode == 'old_ssh' or mode == 'mosh': 742 | COMMANDS[mode](' '.join(sys.argv[2:])) # no escaping 743 | else: 744 | COMMANDS[mode](' '.join([shlex.quote(arg) for arg in sys.argv[2:]])) 745 | 746 | 747 | if __name__ == '__main__': 748 | main() 749 | -------------------------------------------------------------------------------- /tools/nsync: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Usage: 4 | # 5 | # To sync current directory with ~ on target instance 6 | # nsync -m ygpubox 7 | # 8 | # 9 | # 10 | # forked from original by gdb@openai 11 | import argparse 12 | import fcntl 13 | import logging 14 | import os 15 | import select 16 | import subprocess 17 | import sys 18 | 19 | from ncluster import aws_util as u 20 | 21 | # In modules, use `logger = logging.getLogger(__name__)` 22 | 23 | parser = argparse.ArgumentParser(description='sync') 24 | parser.add_argument('-v', '--verbose', action='count', dest='verbosity', 25 | default=0, help='Set verbosity.') 26 | parser.add_argument('-m', '--machine', type=str, default='', help="name of machine to sync with") 27 | parser.add_argument('-d', '--directory', type=str, default='.', help="which directory to sync to (default .)") 28 | args = parser.parse_args() 29 | 30 | logger = logging.getLogger() 31 | logger.addHandler(logging.StreamHandler(sys.stderr)) 32 | if args.verbosity == 0: 33 | logger.setLevel(logging.INFO) 34 | elif args.verbosity >= 1: 35 | logger.setLevel(logging.DEBUG) 36 | 37 | 38 | class Error(Exception): 39 | pass 40 | 41 | 42 | class Resyncd(object): 43 | def __init__(self, remote, sync): 44 | self.remote = remote 45 | self.sync = sync 46 | self.counter = 0 47 | 48 | def run(self): 49 | self.resync() 50 | sources = [sync.source for sync in self.sync] 51 | fswatch = subprocess.Popen(['fswatch'] + sources, stdout=subprocess.PIPE) 52 | fl = fcntl.fcntl(fswatch.stdout.fileno(), fcntl.F_GETFL) 53 | fcntl.fcntl(fswatch.stdout.fileno(), fcntl.F_SETFL, fl | os.O_NONBLOCK) 54 | while True: 55 | r, _, _ = select.select([fswatch.stdout], [], []) 56 | fswatch_output = r[0].read() 57 | output = fswatch_output.decode('ascii') 58 | files = output.strip().split("\n") 59 | 60 | # Ignore emacs swap files 61 | files = [f for f in files if '#' not in os.path.basename(f)] 62 | if files: 63 | print("changed: " + str(files)) 64 | files = set(files) # remove duplicates from fswatch_output 65 | if not files: 66 | continue 67 | 68 | print("---") 69 | print(files) 70 | print("---") 71 | self.resync() 72 | 73 | def resync(self): 74 | procs = [] 75 | for sync in self.sync: 76 | instances = u.lookup_instances(args.machine, verbose=False, limit_to_current_user=True) 77 | assert instances, f"Couldn't find instance {args.machine}"
78 | instance = instances[0] 79 | 80 | print("Syncing with ", u.get_name(instance)) 81 | 82 | command = sync.command(instance) 83 | popen = subprocess.Popen(command) 84 | procs.append({ 85 | 'popen': popen, 86 | 'command': command, 87 | }) 88 | # Wait 89 | for proc in procs: 90 | print(proc["command"]) 91 | proc['popen'].communicate() 92 | for proc in procs: 93 | if proc['popen'].returncode != 0: 94 | raise Error('Bad returncode from %s: %d' % (proc['command'], proc['popen'].returncode)) 95 | logger.info('Resync %d complete', self.counter) 96 | self.counter += 1 97 | 98 | 99 | class Sync(object): 100 | # todo: exclude .#sync.py 101 | excludes = ('*.model', '*.cache', '.picklecache', '*.pyc', '*.gz', '.*', '#*') 102 | 103 | def __init__(self, source, dest, modify_window=True, copy_links=False, excludes=()): 104 | self.source = os.path.expanduser(source) 105 | self.dest = dest 106 | self.modify_window = modify_window 107 | self.copy_links = copy_links 108 | self.excludes = self.excludes + excludes 109 | 110 | def command(self, instance): 111 | excludes = [] 112 | for exclude in self.excludes: 113 | excludes += ['--exclude', exclude] 114 | 115 | # todo, rename no_strict_checking to ssh_command 116 | 117 | keypair_fn = u.get_keypair_fn() 118 | username = u.get_aws_username(instance) 119 | ip = instance.public_ip_address 120 | 121 | ssh_command = "ssh -i %s -o StrictHostKeyChecking=no" % (keypair_fn,) 122 | no_strict_checking = ['-arvce', ssh_command] 123 | 124 | command = ['rsync'] + no_strict_checking + excludes 125 | if self.modify_window: 126 | command += ['--update', '--modify-window=600'] 127 | if self.copy_links: 128 | command += ['-L'] 129 | command += ['-rv', self.source, username + "@" + ip + ':' + self.dest] 130 | print("Running ") 131 | print(command) 132 | return command 133 | 134 | 135 | def main(): 136 | sync = [Sync(source='.', dest=args.directory, copy_links=False), ] 137 | 138 | # Resyncd's 'remote' argument is unused; the target machine comes from the --machine flag 139 | resyncd = Resyncd('asdf', sync) 140 | 141 | resyncd.run() 142 | return 0 143 | 144 | 145 | if __name__ == '__main__': 146 | sys.exit(main()) 147 | --------------------------------------------------------------------------------
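
Usage sketch (a hedged illustration, not part of the repository): assuming the tools/ncluster and tools/nsync scripts above are installed on PATH and AWS credentials are configured, the COMMANDS table and the nsync argparse flags suggest invocations along these lines:

    ncluster ls                                     # default command when no argument is given
    ncluster launch --image_name=dlami23-efa --instance_type=c5.large --name=test
    ncluster spot_prices p3                         # 'p3' shortcut expands to p3.16xlarge, prints per-zone spot prices
    ncluster kill test                              # asks for confirmation before terminating matching instances
    nsync -m test -d /ncluster/sync                 # rsync current directory to /ncluster/sync on instance 'test', re-syncing on fswatch events

nsync resolves the machine name with aws_util.lookup_instances(..., limit_to_current_user=True) and syncs with the first match, so the name only needs to match one of the current user's instances.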