├── yarn-ec2 ├── .gitignore ├── LICENSE ├── data └── instance.matrix.txt ├── README.md ├── ec2_util.py ├── bootstrap.py └── yarn_ec2.py /yarn-ec2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CURR_DIR="$(dirname "$0")" 3 | python "${CURR_DIR}/yarn_ec2.py" "$@" 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | 19 | # Compiled Static libraries 20 | *.lai 21 | *.la 22 | *.a 23 | *.lib 24 | 25 | # Executables 26 | *.exe 27 | *.out 28 | *.app 29 | *~ 30 | *.pyc 31 | *_ec2 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 by Contributors 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /data/instance.matrix.txt: -------------------------------------------------------------------------------- 1 | t2.micro 1 Variable 1 EBS Only $0.013 per Hour 2 | t2.small 1 Variable 2 EBS Only $0.026 per Hour 3 | t2.medium 2 Variable 4 EBS Only $0.052 per Hour 4 | m3.medium 1 3 3.75 1 x 4 SSD $0.070 per Hour 5 | m3.large 2 6.5 7.5 1 x 32 SSD $0.140 per Hour 6 | m3.xlarge 4 13 15 2 x 40 SSD $0.280 per Hour 7 | m3.2xlarge 8 26 30 2 x 80 SSD $0.560 per Hour 8 | c4.large 2 8 3.75 EBS Only $0.116 per Hour 9 | c4.xlarge 4 16 7.5 EBS Only $0.232 per Hour 10 | c4.2xlarge 8 31 15 EBS Only $0.464 per Hour 11 | c4.4xlarge 16 62 30 EBS Only $0.928 per Hour 12 | c4.8xlarge 36 132 60 EBS Only $1.856 per Hour 13 | c3.large 2 7 3.75 2 x 16 SSD $0.105 per Hour 14 | c3.xlarge 4 14 7.5 2 x 40 SSD $0.210 per Hour 15 | c3.2xlarge 8 28 15 2 x 80 SSD $0.420 per Hour 16 | c3.4xlarge 16 55 30 2 x 160 SSD $0.840 per Hour 17 | c3.8xlarge 32 108 60 2 x 320 SSD $1.680 per Hour 18 | g2.2xlarge 8 26 15 60 SSD $0.650 per Hour 19 | g2.8xlarge 32 104 60 2 x 120 SSD $2.600 per Hour 20 | r3.large 2 6.5 15 1 x 32 SSD $0.175 per Hour 21 | r3.xlarge 4 13 30.5 1 x 80 SSD $0.350 per Hour 22 | r3.2xlarge 8 26 61 1 x 160 SSD $0.700 per Hour 23 | r3.4xlarge 16 52 122 1 x 320 SSD $1.400 per Hour 24 | r3.8xlarge 32 104 244 2 x 320 SSD $2.800 per Hour 25 | i2.xlarge 4 14 30.5 1 x 800 SSD $0.853 per Hour 26 | i2.2xlarge 8 27 61 2 x 800 SSD $1.705 per Hour 27 | i2.4xlarge 16 53 122 4 x 800 SSD $3.410 per Hour 28 | i2.8xlarge 32 104 244 8 x 800 SSD $6.820 per Hour 29 | d2.xlarge 4 14 30.5 3 x 2000 HDD $0.690 per Hour 30 | d2.2xlarge 8 28 61 6 x 2000 HDD $1.380 per Hour 31 | d2.4xlarge 16 56 122 12 x 2000 HDD $2.760 per Hour 32 | d2.8xlarge 36 116 244 24 x 2000 HDD $5.520 per Hour 33 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | YARN EC2 2 | ======== 3 | This is a script to help you quickly set up a dynamic YARN cluster on EC2. 4 | It follows a cloud workflow that uses S3 for distributed data storage and YARN for computing. 5 | 6 | ***Features*** 7 | - Dynamically add or remove slave nodes from the cluster. 8 | - Customizable installation of packages. 9 | 10 | How to Use 11 | ---------- 12 | To start a cluster, the script follows two steps: (1) start the master machine; (2) add slaves. 13 | This two-step procedure lets you add and remove slaves on the fly. 14 | 15 | - Start your master machine 16 | - ```./yarn-ec2 -k mykey -i mypem.pem launch cluster-name ``` 17 | - Add slaves to the cluster 18 | - ```./yarn-ec2 -k mykey -i mypem.pem -s nslave addslave cluster-name ``` 19 | - Alternatively, you can add spot instances to the cluster 20 | - ```./yarn-ec2 -k mykey -i mypem.pem -s nslave addspot cluster-name``` 21 | - The on-demand price is used as the bid by default; you can change it with the ```--spot-price``` option. 22 | - Both addslave and addspot send requests to EC2 that may not be fulfilled immediately. 23 | - New slaves connect to the master node after one bootstrap pass (which takes around 1 minute). 24 | - You can browse the YARN resource manager web UI for the status of the cluster. 25 | - Shut down the machines manually in the EC2 console. 26 | 27 | Distributed Storage 28 | ------------------- 29 | Because the cluster is dynamic, all the nodes are used only as computing nodes. 30 | HDFS is started only on the master machine, for temporary code transfer. 31 | S3 is normally used for distributed storage instead. 32 | 33 | 34 | Customize Installation 35 | ---------------------- 36 | You can modify ```custom_master_install``` and ```custom_all_nodes_install``` in [bootstrap.py](https://github.com/tqchen/yarn-ec2/blob/master/bootstrap.py#L21) 37 | to add the packages you would like to install on each machine; a short example is sketched at the end of this README. 38 | 39 | 40 | Restart Master Machine 41 | ---------------------- 42 | If you stopped the master and restarted it from the EC2 console, there is no need to run the launch step again. 43 | Instead, log into the master machine and run ```startup.sh``` in the home folder. 44 | After the startup finishes, you can continue with the steps for adding slaves. 45 | 46 | Acknowledgement 47 | --------------- 48 | Part of yarn-ec2 is adapted from the [spark-ec2](https://github.com/amplab/spark-ec2) script. 49 | 50 | Note on Implementation 51 | ---------------------- 52 | Most existing cluster launch scripts follow a start-then-deploy approach: 53 | - First start all the nodes, and copy the master's credentials to the slaves. 54 | - Deploy the slaves using ssh or pssh commands from the master. 55 | 56 | These scripts require the master to be aware of the slaves, which makes it hard to dynamically add or remove nodes. 57 | yarn-ec2 works the other way around: the master does not need to be aware of the slaves beforehand. 58 | - First start the master, which listens for requests from slaves. 59 | - When a slave starts, it runs the bootstrap script, installs the dependencies, and reports to the master. 60 | - YARN then dynamically adds the slave to the cluster. 61 | - When a slave is removed, the master detects the event through the cluster health check and removes it from the cluster.
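Customization Example
---------------------
As a sketch of the customization above (```install_spark```, ```install_r```, ```install_gcc```, ```install_packages``` and ```install_xgboost``` are helpers that already exist in bootstrap.py; the extra apt package names are only illustrative):

```python
# in bootstrap.py
def custom_master_install():
    # optional helpers shipped with bootstrap.py
    install_spark()
    install_r()

def custom_all_nodes_install():
    install_gcc()
    # illustrative extra apt packages for every node
    install_packages(['htop', 'tmux'])
    install_xgboost()
```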
62 | -------------------------------------------------------------------------------- /ec2_util.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import boto 20 | from boto.ec2.blockdevicemapping import BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType 21 | from boto import ec2 22 | import sys 23 | import string 24 | import time 25 | 26 | # Get number of local (instance-store) disks available for a given EC2 instance type. 27 | def get_num_disks(instance): 28 | # From http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html 29 | # Updated 2014-6-20 30 | disks_by_instance = { 31 | "m1.small": 1, 32 | "m1.medium": 1, 33 | "m1.large": 2, 34 | "m1.xlarge": 4, 35 | "t1.micro": 0, 36 | "c1.medium": 1, 37 | "c1.xlarge": 4, 38 | "m2.xlarge": 1, 39 | "m2.2xlarge": 1, 40 | "m2.4xlarge": 2, 41 | "cc1.4xlarge": 2, 42 | "cc2.8xlarge": 4, 43 | "cg1.4xlarge": 2, 44 | "hs1.8xlarge": 24, 45 | "cr1.8xlarge": 2, 46 | "hi1.4xlarge": 2, 47 | "m3.medium": 1, 48 | "m3.large": 1, 49 | "m3.xlarge": 2, 50 | "m3.2xlarge": 2, 51 | "i2.xlarge": 1, 52 | "i2.2xlarge": 2, 53 | "i2.4xlarge": 4, 54 | "i2.8xlarge": 8, 55 | "c3.large": 2, 56 | "c3.xlarge": 2, 57 | "c3.2xlarge": 2, 58 | "c3.4xlarge": 2, 59 | "c3.8xlarge": 2, 60 | "r3.large": 1, 61 | "r3.xlarge": 1, 62 | "r3.2xlarge": 1, 63 | "r3.4xlarge": 1, 64 | "r3.8xlarge": 2, 65 | "g2.2xlarge": 1, 66 | "g2.8xlarge": 2, 67 | "t2.micro": 0, "t2.small": 0, "t2.medium": 0 68 | } 69 | if instance in disks_by_instance: 70 | return disks_by_instance[instance] 71 | else: 72 | print >> sys.stderr, ("WARNING: Don't know number of disks on instance type %s; assuming 1" 73 | % instance) 74 | return 1 75 | 76 | def get_instance_type(instance): 77 | instance_types = { 78 | "m1.small": "pvm", 79 | "m1.medium": "pvm", 80 | "m1.large": "pvm", 81 | "m1.xlarge": "pvm", 82 | "t1.micro": "pvm", 83 | "c1.medium": "pvm", 84 | "c1.xlarge": "pvm", 85 | "m2.xlarge": "pvm", 86 | "m2.2xlarge": "pvm", 87 | "m2.4xlarge": "pvm", 88 | "cc1.4xlarge": "hvm", 89 | "cc2.8xlarge": "hvm", 90 | "cg1.4xlarge": "hvm", 91 | "hs1.8xlarge": "pvm", 92 | "hi1.4xlarge": "pvm", 93 | "m3.medium": "hvm", 94 | "m3.large": "hvm", 95 | "m3.xlarge": "hvm", 96 | "m3.2xlarge": "hvm", 97 | "cr1.8xlarge": "hvm", 98 | "i2.xlarge": "hvm", 99 | "i2.2xlarge": "hvm", 100 | "i2.4xlarge": "hvm", 101 | "i2.8xlarge": "hvm", 102 | "c3.large": "pvm", 103 | "c3.xlarge": "pvm", 104 | "c3.2xlarge": "pvm", 105 | "c3.4xlarge": "pvm", 106 | "c3.8xlarge": "pvm", 107 | "r3.large": "hvm", 108 | "g2.2xlarge": "hvm", 109 | "g2.8xlarge": "hvm", 110 | "r3.xlarge": "hvm", 111 | "r3.2xlarge": "hvm", 112 | "r3.4xlarge": "hvm", 113 | "r3.8xlarge": "hvm", 114 | "t2.micro": "hvm", 115 | "t2.small": 
"hvm", 116 | "t2.medium": "hvm" 117 | } 118 | if instance in instance_types: 119 | return instance_types[instance] 120 | else: 121 | print >> sys.stderr,\ 122 | "Don't recognize %s, assuming type is pvm" % instance 123 | return 'pvm' 124 | 125 | # Wait for a set of launched instances to exit the "pending" state 126 | # (i.e. either to start running or to fail and be terminated) 127 | def wait_for_instances(conn, instances): 128 | while True: 129 | for i in instances: 130 | i.update() 131 | status = conn.get_all_instance_status(instance_ids = [i.id for i in instances]) 132 | if len([i for i in instances if i.state == 'pending']) > 0: 133 | time.sleep(5) 134 | elif len([i for i in status if i.system_status.status == 'initializing']) > 0: 135 | time.sleep(5) 136 | else: 137 | return 138 | 139 | # Get the EC2 security group of the given name, creating it if it doesn't exist 140 | def get_or_make_group(conn, name, make_if_not_exist = True): 141 | groups = conn.get_all_security_groups() 142 | group = [g for g in groups if g.name == name] 143 | if len(group) > 0: 144 | return group[0] 145 | else: 146 | if not make_if_not_exist: 147 | print >> sys.stderr, "ERROR: Could not find any existing security group" 148 | sys.exit(1) 149 | print "Creating security group " + name 150 | return conn.create_security_group(name, "MODE EC2 group") 151 | 152 | # Check whether a given EC2 instance object is in a state we consider active, 153 | # i.e. not terminating or terminated. We count both stopping and stopped as 154 | # active since we can restart stopped clusters. 155 | def is_active(instance): 156 | return (instance.state in ['pending', 'running', 'stopping', 'stopped']) 157 | 158 | # Attempt to resolve an appropriate AMI given the architecture and 159 | # region of the request. 160 | # Information regarding Amazon Linux AMI instance type was update on 2014-6-20: 161 | # http://aws.amazon.com/amazon-linux-ami/instance-type-matrix/ 162 | def get_block_device(instance_type, ebs_vol_size): 163 | block_map = BlockDeviceMapping() 164 | 165 | if ebs_vol_size > 0: 166 | device = EBSBlockDeviceType() 167 | device.size = ebs_vol_size 168 | device.delete_on_termination = True 169 | block_map["/dev/sdv"] = device 170 | 171 | for i in range(get_num_disks(instance_type)): 172 | dev = BlockDeviceType() 173 | dev.ephemeral_name = 'ephemeral%d' % i 174 | # The first ephemeral drive is /dev/sdb. 175 | name = '/dev/sd' + string.letters[i + 1] 176 | block_map[name] = dev 177 | 178 | return block_map 179 | 180 | 181 | # Get the EC2 instances in an existing cluster if available. 182 | # Returns a tuple of lists of EC2 instance objects for the masters and slaves 183 | def get_existing_cluster(conn, cluster_name, die_on_error=True): 184 | print "Searching for existing cluster " + cluster_name + "..." 
185 | reservations = conn.get_all_instances() 186 | master_nodes = [] 187 | slave_nodes = [] 188 | for res in reservations: 189 | active = [i for i in res.instances if is_active(i)] 190 | for inst in active: 191 | group_names = [g.name for g in inst.groups] 192 | if group_names == [cluster_name + "-master"]: 193 | master_nodes.append(inst) 194 | elif group_names == [cluster_name + "-slave"]: 195 | slave_nodes.append(inst) 196 | if any((master_nodes, slave_nodes)): 197 | print ("Found %d master(s), %d slaves" % (len(master_nodes), len(slave_nodes))) 198 | if master_nodes != [] or not die_on_error: 199 | return (master_nodes, slave_nodes) 200 | else: 201 | if master_nodes == [] and slave_nodes != []: 202 | print >> sys.stderr, "ERROR: Could not find master in group " + cluster_name + "-master" 203 | else: 204 | print >> sys.stderr, "ERROR: Could not find any existing cluster" 205 | sys.exit(1) 206 | -------------------------------------------------------------------------------- /bootstrap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | script to install everything needed 5 | to work on a bare Linux machine 6 | 7 | It installs the minimum set of dependencies. 8 | """ 9 | import sys 10 | import os 11 | import logging 12 | import subprocess 13 | import xml.etree.ElementTree as ElementTree 14 | import xml.dom.minidom as minidom 15 | import socket 16 | import time 17 | import pwd 18 | 19 | ###---------------------------------------------------## 20 | # Configuration section: edit these to customize # 21 | ###---------------------------------------------------## 22 | node_apt_packages = [ 23 | 'emacs', 24 | 'git', 25 | 'g++', 26 | 'make', 27 | 'python-numpy', 28 | 'libprotobuf-dev', 29 | 'libcurl4-openssl-dev'] 30 | 31 | # master-only packages 32 | master_apt_packages = [ 33 | 'protobuf-compiler'] 34 | 35 | # list of R packages to be installed on the master 36 | master_r_packages = [ 37 | 'r-base-dev', 38 | 'r-base', 39 | 'r-cran-statmod', 40 | 'r-cran-RCurl', 41 | 'r-cran-rjson' 42 | ] 43 | 44 | # download link and directory name for Hadoop. 45 | hadoop_url = 'http://apache.claz.org/hadoop/common/hadoop-2.8.0/hadoop-2.8.0.tar.gz' 46 | hadoop_dir = 'hadoop-2.8.0' 47 | 48 | # customized installation script for the master. 49 | # See the optional installation scripts below for choices. 50 | def custom_master_install(): 51 | #install_spark() 52 | #install_r() 53 | pass 54 | 55 | # customized installation script for all nodes. 56 | def custom_all_nodes_install(): 57 | install_gcc() 58 | pass 59 | 60 | ###---------------------------------------------------## 61 | # Automatically set by script # 62 | ###---------------------------------------------------## 63 | USER_NAME = 'ubuntu' 64 | # cluster variables; rewritten by yarn_ec2.py through the EC2 user data 65 | MASTER = os.getenv('MY_MASTER_DNS', '') 66 | # the instance type of the current node 67 | NODE_TYPE = os.getenv('MY_NODE_TYPE', 'm3.xlarge') 68 | NODE_VMEM = int(os.getenv('MY_NODE_VMEM', str(1024*15))) 69 | NODE_VCPU = int(os.getenv('MY_NODE_VCPU', '4')) 70 | AWS_ID = os.getenv('AWS_ACCESS_KEY_ID', 'undefined') 71 | AWS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY', 'undefined') 72 | JAVA_HOME = os.getenv('JAVA_HOME') 73 | HADOOP_HOME = os.getenv('HADOOP_HOME') 74 | DISK_LIST = [('xvd' + chr(ord('b') + i)) for i in range(10)] 75 | ENVIRON = os.environ.copy() 76 | 77 | ###--------------------------------## 78 | # Optional installation scripts. 
# 79 | ###--------------------------------## 80 | def install_r(): 81 | if master_r_packages: 82 | sudo("apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E084DAB9") 83 | sudo('sh -c "echo deb https://cran.r-project.org/bin/linux/ubuntu trusty/ >> /etc/apt/sources.list"') 84 | sudo('apt-get -y update') 85 | sudo('apt-get -y install %s' % (' '.join(master_r_packages))) 86 | 87 | 88 | def install_spark(): 89 | run('wget https://www.apache.org/dist/spark/spark-2.1.1/spark-2.1.1-bin-hadoop2.7.tgz') 90 | run('tar xf spark-2.1.1-bin-hadoop2.7.tgz') 91 | run('rm -rf spark-2.1.1-bin-hadoop2.7.tgz') 92 | with open('.bashrc', 'a') as fo: 93 | fo.write('\nexport PATH=${PATH}:${HOME}/spark-2.1.1-bin-hadoop2.7/bin\n') 94 | 95 | 96 | def install_xgboost(): 97 | run('git clone --recursive https://github.com/dmlc/xgboost') 98 | run('cd xgboost; cp make/config.mk .; echo USE_S3=1 >> config.mk; make -j4') 99 | 100 | ### Script section ### 101 | def run(cmd): 102 | try: 103 | print cmd 104 | logging.info(cmd) 105 | proc = subprocess.Popen(cmd, shell=True, env = ENVIRON, 106 | stdout=subprocess.PIPE, stderr = subprocess.PIPE) 107 | out, err = proc.communicate() 108 | retcode = proc.poll() 109 | if retcode != 0: 110 | logging.error('Command %s returns %d' % (cmd, retcode)) 111 | logging.error(out) 112 | logging.error(err) 113 | else: 114 | print out 115 | except Exception as e: 116 | print(str(e)) 117 | logging.error('Exception running: %s' % cmd) 118 | logging.error(str(e)) 119 | pass 120 | 121 | def sudo(cmd): 122 | run('sudo %s' % cmd) 123 | 124 | ### Installation helpers ### 125 | def install_packages(pkgs): 126 | sudo('apt-get -y update') 127 | sudo('apt-get -y install %s' % (' '.join(pkgs))) 128 | 129 | # install g++-4.9, needed for working C++11 <regex> support. 130 | def install_gcc(): 131 | sudo('add-apt-repository -y ppa:ubuntu-toolchain-r/test') 132 | sudo('apt-get -y update') 133 | sudo('apt-get -y install g++-4.9') 134 | 135 | def install_java(): 136 | """ 137 | install Java and set up environment variables. 138 | Returns the environment variables that need to be exported. 139 | """ 140 | if not os.path.exists('jdk1.8.0_131'): 141 | run('wget --no-check-certificate --no-cookies'\ 142 | ' --header \"Cookie: oraclelicense=accept-securebackup-cookie\"'\ 143 | ' http://download.oracle.com/otn-pub/java/jdk/8u131-b11/d54c1d3a095b4ff2b6607d096fa80163/jdk-8u131-linux-x64.tar.gz') 144 | run('tar xf jdk-8u131-linux-x64.tar.gz') 145 | run('rm -f jdk-8u131-linux-x64.tar.gz') 146 | global JAVA_HOME 147 | if JAVA_HOME is None: 148 | JAVA_HOME = os.path.abspath('jdk1.8.0_131') 149 | return [('JAVA_HOME', JAVA_HOME)] 150 | 151 | 152 | def install_hadoop(is_master): 153 | def update_site(fname, rmap): 154 | """ 155 | update a Hadoop site XML file with the given key/value map 156 | """ 157 | try: 158 | tree = ElementTree.parse(fname) 159 | root = tree.getroot() 160 | except Exception: 161 | cfg = ElementTree.Element("configuration") 162 | tree = ElementTree.ElementTree(cfg) 163 | root = tree.getroot() 164 | rset = set() 165 | for prop in root.getiterator('property'): 166 | prop = dict((p.tag, p) for p in prop) 167 | name = prop['name'].text.strip() 168 | if name in rmap: 169 | prop['value'].text = str(rmap[name]) 170 | rset.add(name) 171 | for name, text in rmap.iteritems(): 172 | if name in rset: 173 | continue 174 | prop = ElementTree.SubElement(root, 'property') 175 | ElementTree.SubElement(prop, 'name').text = name 176 | ElementTree.SubElement(prop, 'value').text = str(text) 177 | rough_string = ElementTree.tostring(root, 'utf-8') 178 | reparsed = minidom.parseString(rough_string) 179 
| pretty = reparsed.toprettyxml(indent='\t') 180 | fo = open(fname, 'w') 181 | fo.write(pretty) 182 | fo.close() 183 | 184 | def setup_hadoop_site(master, hadoop_dir, hdfs_dir, vcpu, vmem): 185 | """ 186 | set up the Hadoop site files given the parameters 187 | 188 | Parameters 189 | ---------- 190 | master: the DNS name of the master 191 | hadoop_dir: list of directories for Hadoop temp files 192 | hdfs_dir: list of directories for HDFS data 193 | vcpu: the number of cpus the current machine has 194 | vmem: the memory (MB) the current machine has 195 | """ 196 | if vmem < 4 * 1024: 197 | reserved_ram = 256 198 | elif vmem < 8 * 1024: 199 | reserved_ram = 1 * 1024 200 | elif vmem < 24 * 1024: 201 | reserved_ram = 2 * 1024 202 | elif vmem < 48 * 1024: 203 | reserved_ram = 2 * 1024 204 | elif vmem < 64 * 1024: 205 | reserved_ram = 6 * 1024 206 | else: 207 | reserved_ram = 8 * 1024 208 | ram_per_container = (vmem - reserved_ram) / vcpu 209 | 210 | if is_master: 211 | vcpu = vcpu - 2 212 | 213 | tmp_dir = hadoop_dir[0] 214 | core_site = { 215 | 'fs.defaultFS': 'hdfs://%s:9000/' % master, 216 | 'fs.s3n.impl': 'org.apache.hadoop.fs.s3native.NativeS3FileSystem', 217 | 'hadoop.tmp.dir': tmp_dir 218 | } 219 | if AWS_ID != 'undefined': 220 | core_site['fs.s3n.awsAccessKeyId'] = AWS_ID 221 | core_site['fs.s3n.awsSecretAccessKey'] = AWS_KEY 222 | 223 | update_site('%s/etc/hadoop/core-site.xml' % HADOOP_HOME, core_site) 224 | hdfs_site = { 225 | 'dfs.data.dir': ','.join(['%s/data' % d for d in hdfs_dir]), 226 | 'dfs.permissions': 'false', 227 | 'dfs.replication': '1' 228 | } 229 | update_site('%s/etc/hadoop/hdfs-site.xml' % HADOOP_HOME, hdfs_site) 230 | yarn_site = { 231 | 'yarn.resourcemanager.resource-tracker.address': '%s:8025' % master, 232 | 'yarn.resourcemanager.scheduler.address': '%s:8030' % master, 233 | 'yarn.resourcemanager.address': '%s:8032' % master, 234 | 'yarn.scheduler.minimum-allocation-mb': 512, 235 | 'yarn.scheduler.maximum-allocation-mb': 640000, 236 | 'yarn.scheduler.minimum-allocation-vcores': 1, 237 | 'yarn.scheduler.maximum-allocation-vcores': 32, 238 | 'yarn.nodemanager.resource.memory-mb': vcpu * ram_per_container, 239 | 'yarn.nodemanager.resource.cpu-vcores': vcpu, 240 | 'yarn.log-aggregation-enable': 'true', 241 | 'yarn.nodemanager.vmem-check-enabled': 'false', 242 | 'yarn.nodemanager.aux-services': 'mapreduce_shuffle', 243 | 'yarn.nodemanager.aux-services.mapreduce.shuffle.class': 'org.apache.hadoop.mapred.ShuffleHandler', 244 | 'yarn.nodemanager.remote-app-log-dir': os.path.join(tmp_dir, 'logs'), 245 | 'yarn.nodemanager.log-dirs': os.path.join(tmp_dir, 'userlogs'), 246 | 'yarn.nodemanager.local-dirs': ','.join(['%s/yarn/nm-local-dir' % d for d in hadoop_dir]) 247 | } 248 | update_site('%s/etc/hadoop/yarn-site.xml' % HADOOP_HOME, yarn_site) 249 | mapred_site = { 250 | 'mapreduce.application.classpath' : ':'.join(['$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*', 251 | '$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*', 252 | '$HADOOP_MAPRED_HOME/share/hadoop/tools/lib/*']), 253 | 'yarn.app.mapreduce.am.resource.mb': 2 * ram_per_container, 254 | 'yarn.app.mapreduce.am.command-opts': '-Xmx%dm' % int(0.8 * 2 * ram_per_container), 255 | 'mapreduce.framework.name': 'yarn', 256 | 'mapreduce.map.cpu.vcores': 1, 257 | 'mapreduce.map.memory.mb': ram_per_container, 258 | 'mapreduce.map.java.opts': '-Xmx%dm' % int(0.8 * ram_per_container), 259 | 'mapreduce.reduce.cpu.vcores': 1, 260 | 'mapreduce.reduce.memory.mb': 2 * ram_per_container, 261 | 'mapreduce.reduce.java.opts': '-Xmx%dm' % int(0.8 * 2 * ram_per_container) 262 | } 263 
| update_site('%s/etc/hadoop/mapred-site.xml' % HADOOP_HOME, mapred_site) 264 | capacity_site = { 265 | 'yarn.scheduler.capacity.resource-calculator': 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' 266 | } 267 | update_site('%s/etc/hadoop/capacity-scheduler.xml' % HADOOP_HOME, capacity_site) 268 | fo = open('%s/etc/hadoop/hadoop-env.sh' % HADOOP_HOME, 'w') 269 | fo.write('export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$HADOOP_PREFIX/share/hadoop/tools/lib/*\n') 270 | fo.write('export HADOOP_LOG_DIR=%s/log\n' % tmp_dir) 271 | fo.write('export YARN_LOG_DIR=%s/log\n' % tmp_dir) 272 | fo.write('export JAVA_HOME=\"%s\"\n' % JAVA_HOME) 273 | fo.close() 274 | fo = open('%s/etc/hadoop/slaves' % HADOOP_HOME, 'w') 275 | fo.write(master + '\n') 276 | fo.close() 277 | 278 | def run_install(): 279 | if not os.path.exists('hadoop-2.8.0'): 280 | run('wget %s' % hadoop_url) 281 | run('tar xf hadoop-2.8.0.tar.gz') 282 | run('rm -f hadoop-2.8.0.tar.gz') 283 | global HADOOP_HOME 284 | if HADOOP_HOME is None: 285 | HADOOP_HOME = os.path.abspath('hadoop-2.8.0') 286 | env = [('HADOOP_HOME', HADOOP_HOME)] 287 | env += [('HADOOP_PREFIX', HADOOP_HOME)] 288 | env += [('HADOOP_MAPRED_HOME', HADOOP_HOME)] 289 | env += [('HADOOP_COMMON_HOME', HADOOP_HOME)] 290 | env += [('HADOOP_HDFS_HOME', HADOOP_HOME)] 291 | env += [('YARN_HOME', HADOOP_HOME)] 292 | env += [('YARN_CONF_DIR', '%s/etc/hadoop' % HADOOP_HOME)] 293 | env += [('HADOOP_CONF_DIR', '%s/etc/hadoop' % HADOOP_HOME)] 294 | disks = ['/disk/%s' % d for d in DISK_LIST if os.path.exists('/dev/%s' % d)] 295 | setup_hadoop_site(MASTER, 296 | ['%s/hadoop' % d for d in disks], 297 | ['%s/hadoop/dfs' % d for d in disks], 298 | NODE_VCPU, NODE_VMEM) 299 | return env 300 | 301 | return run_install() 302 | 303 | def regsshkey(fname): 304 | for dns in (open(fname).readlines() + ['localhost', '0.0.0.0']): 305 | try: 306 | run('ssh-keygen -R %s' % dns.strip()) 307 | except Exception: 308 | pass 309 | run('ssh-keyscan %s >> ~/.ssh/known_hosts' % dns.strip()) 310 | 311 | # main script to install all dependencies 312 | def install_main(is_master): 313 | if is_master: 314 | install_packages(master_apt_packages + node_apt_packages) 315 | else: 316 | install_packages(node_apt_packages) 317 | 318 | env = [] 319 | env += install_java() 320 | env += install_hadoop(is_master) 321 | path = ['$HADOOP_HOME/bin', '$HADOOP_HOME/sbin', '$JAVA_HOME/bin'] 322 | env += [('LD_LIBRARY_PATH', '$HADOOP_HOME/native/lib')] 323 | env += [('LD_LIBRARY_PATH', '${LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server')] 324 | env += [('LD_LIBRARY_PATH', '${LD_LIBRARY_PATH}:/usr/local/lib')] 325 | env += [('LIBHDFS_OPTS', '-Xmx128m')] 326 | env += [('MY_MASTER_DNS', MASTER)] 327 | env += [('MY_NODE_TYPE', NODE_TYPE)] 328 | env += [('MY_NODE_VMEM', str(NODE_VMEM))] 329 | env += [('MY_NODE_VCPU', str(NODE_VCPU))] 330 | if AWS_ID != 'undefined': 331 | env += [('AWS_ACCESS_KEY_ID', AWS_ID)] 332 | if AWS_KEY != 'undefined': 333 | env += [('AWS_SECRET_ACCESS_KEY', AWS_KEY)] 334 | # write the environment to a file sourced by .bashrc 335 | fo = open('.hadoop_env', 'w') 336 | for k, v in env: 337 | fo.write('export %s=%s\n' % (k, v)) 338 | ENVIRON[k] = v 339 | fo.write('export PATH=$PATH:%s\n' % (':'.join(path))) 340 | fo.write('export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib\n') 341 | fo.close() 342 | for l in open('.bashrc'): 343 | if l.find('.hadoop_env') != -1: 344 | return 345 | run('echo source ~/.hadoop_env >> ~/.bashrc') 346 | # allow passwordless ssh between nodes that already share the key. 
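 # The snippet below creates a passwordless RSA key on first run and authorizes it for localhost; Hadoop's start scripts ssh into every host listed in the slaves file, so each node must accept its own key. It is a no-op if ~/.ssh/id_rsa already exists.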
347 | key_setup = """ 348 | [ -f ~/.ssh/id_rsa ] || 349 | (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa && 350 | cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys) 351 | """ 352 | run(key_setup) 353 | regsshkey('%s/etc/hadoop/slaves' % HADOOP_HOME) 354 | # end of installation. 355 | 356 | # Make the startup script and run it 357 | def make_startup_script(is_master): 358 | assert JAVA_HOME is not None 359 | assert HADOOP_HOME is not None 360 | assert NODE_VCPU is not None 361 | assert NODE_VMEM is not None 362 | disks = [] 363 | cmds = [] 364 | 365 | if is_master: 366 | cmds.append('$HADOOP_HOME/sbin/stop-all.sh') 367 | 368 | for d in DISK_LIST: 369 | if os.path.exists('/dev/%s' % d): 370 | cmds.append('sudo umount /dev/%s' % d) 371 | cmds.append('sudo mkfs -t ext4 /dev/%s' % d) 372 | cmds.append('sudo mkdir -p /disk/%s' % d) 373 | cmds.append('sudo mount /dev/%s /disk/%s' % (d, d)) 374 | disks.append('/disk/%s' % d) 375 | 376 | for d in disks: 377 | cmds.append('sudo mkdir -p %s/hadoop' % d) 378 | cmds.append('sudo chown ubuntu:ubuntu %s/hadoop' % d) 379 | cmds.append('sudo mkdir -p %s/tmp' % d) 380 | cmds.append('sudo chown ubuntu:ubuntu %s/tmp' % d) 381 | cmds.append('rm -rf %s/hadoop/dfs' % d) 382 | cmds.append('mkdir %s/hadoop/dfs' % d) 383 | cmds.append('mkdir %s/hadoop/dfs/name' % d) 384 | cmds.append('mkdir %s/hadoop/dfs/data' % d) 385 | 386 | # format the namenode and start the daemons 387 | if is_master: 388 | cmds.append('$HADOOP_HOME/bin/hadoop namenode -format') 389 | cmds.append('$HADOOP_HOME/sbin/start-all.sh') 390 | else: 391 | cmds.append('export HADOOP_LIBEXEC_DIR=$HADOOP_HOME/libexec &&'\ 392 | ' $HADOOP_HOME/sbin/yarn-daemon.sh --config $HADOOP_HOME/etc/hadoop start nodemanager') 393 | with open('startup.sh', 'w') as fo: 394 | fo.write('#!/bin/bash\n') 395 | fo.write('set -v\n') 396 | fo.write('\n'.join(cmds)) 397 | run('chmod +x startup.sh') 398 | run('./startup.sh') 399 | 400 | 401 | def main(): 402 | global MASTER 403 | logging.basicConfig(filename = 'bootstrap.log', level = logging.INFO, 404 | format='%(asctime)s %(levelname)s %(message)s') 405 | if MASTER == '': 406 | is_master = True 407 | MASTER = socket.getfqdn() 408 | logging.info('no master specified, assuming this node is the master: %s' % MASTER) 409 | else: 410 | is_master = socket.getfqdn() == MASTER 411 | tstart = time.time() 412 | install_main(is_master) 413 | tmid = time.time() 414 | logging.info('installation finishes in %g secs' % (tmid - tstart)) 415 | make_startup_script(is_master) 416 | ENVIRON['HADOOP_HOME'] = HADOOP_HOME 417 | ENVIRON['JAVA_HOME'] = JAVA_HOME 418 | if is_master: 419 | custom_master_install() 420 | custom_all_nodes_install() 421 | tend = time.time() 422 | logging.info('bootstrap finishes in %g secs' % (tend - tmid)) 423 | logging.info('all finishes in %g secs' % (tend - tstart)) 424 | 425 | if __name__ == '__main__': 426 | pw_record = pwd.getpwnam(USER_NAME) 427 | user_name = pw_record.pw_name 428 | user_home_dir = pw_record.pw_dir 429 | user_uid = pw_record.pw_uid 430 | user_gid = pw_record.pw_gid 431 | env = os.environ.copy() 432 | cwd = user_home_dir 433 | ENVIRON['HOME'] = user_home_dir 434 | os.setgid(user_gid) 435 | os.setuid(user_uid) 436 | os.chdir(user_home_dir) 437 | main() 438 | -------------------------------------------------------------------------------- /yarn_ec2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import logging 4 | import os 5 | import random 6 | import pipes 7 | import subprocess 8 | import sys 9 | from sys import stderr 
10 | import time 11 | from boto import ec2 12 | from optparse import OptionParser 13 | import ec2_util 14 | 15 | class UsageError(Exception): 16 | pass 17 | 18 | # Configure and parse our command-line arguments 19 | def parse_args(): 20 | parser = OptionParser( 21 | usage="yarn-ec2 [options] <action> <cluster_name>" 22 | + "\n\n<action> can be: launch, addslave, addspot, login, get-master, forward-port", 23 | add_help_option=False) 24 | parser.add_option( 25 | "-h", "--help", action="help", 26 | help="Show this help message and exit") 27 | parser.add_option( 28 | "-s", "--slaves", type="int", default=1, 29 | help="Number of slaves to launch (default: 1)") 30 | parser.add_option( 31 | "-w", "--wait", type="int", default=120, 32 | help="Seconds to wait for nodes to start (default: 120)") 33 | parser.add_option( 34 | "-k", "--key-pair", 35 | help="Key pair to use on instances") 36 | parser.add_option( 37 | "-i", "--identity-file", 38 | help="SSH private key file to use for logging into instances") 39 | parser.add_option( 40 | "-t", "--instance-type", default="c3.2xlarge", 41 | help="Type of instance to launch (default: c3.2xlarge). " + 42 | "WARNING: must be 64-bit; small instances won't work") 43 | parser.add_option( 44 | "-r", "--region", default="us-west-2", 45 | help="EC2 region zone to launch instances in") 46 | parser.add_option( 47 | "-z", "--zone", default="", 48 | help="Availability zone to launch instances in, or 'all' to spread " + 49 | "slaves across multiple (an additional $0.01/Gb for bandwidth " + 50 | "between zones applies)") 51 | parser.add_option("-a", "--ami", help="Amazon Machine Image ID to use") 52 | parser.add_option( 53 | "--include-aws-key", action="store_true", default=False, 54 | help=("Whether to include AWS key information in the bootstrap script;" + 55 | " this can be very dangerous, as the bootstrap script is not encrypted")) 56 | parser.add_option( 57 | "--spot-price", metavar="PRICE", type="float", 58 | help="If specified, launch slaves as spot instances with the given " + 59 | "maximum price (in dollars)") 60 | parser.add_option( 61 | "-u", "--user", default="ubuntu", 62 | help="The SSH user you want to connect as (default: ubuntu)") 63 | parser.add_option( 64 | "--delete-groups", action="store_true", default=False, 65 | help="When destroying a cluster, delete the security groups that were created") 66 | 67 | (opts, args) = parser.parse_args() 68 | if len(args) != 2: 69 | parser.print_help() 70 | sys.exit(1) 71 | action, cluster_name = args 72 | opts.action = action 73 | opts.cluster_name = cluster_name 74 | # Boto config check 75 | # http://boto.cloudhackers.com/en/latest/boto_config_tut.html 76 | home_dir = os.getenv('HOME') 77 | if home_dir is None or not os.path.isfile(home_dir + '/.boto'): 78 | if not os.path.isfile('/etc/boto.cfg'): 79 | if os.getenv('AWS_ACCESS_KEY_ID') is None: 80 | print >> stderr, ("ERROR: The environment variable AWS_ACCESS_KEY_ID " + 81 | "must be set") 82 | sys.exit(1) 83 | if os.getenv('AWS_SECRET_ACCESS_KEY') is None: 84 | print >> stderr, ("ERROR: The environment variable AWS_SECRET_ACCESS_KEY " + 85 | "must be set") 86 | sys.exit(1) 87 | return opts 88 | 89 | def get_resource_map(fname = 'data/instance.matrix.txt'): 90 | vcpu = {} 91 | vram = {} 92 | price = {} 93 | for l in open(fname): 94 | if len(l.strip()) == 0: 95 | continue 96 | arr = l.split('\t') 97 | if len(arr) >= 6: 98 | vcpu[arr[0]] = int(arr[1]) 99 | vram[arr[0]] = int(float(arr[3]) * 1024) 100 | price[arr[0]] = float(arr[5].split()[0].strip('$')) 101 | return vcpu, vram, price 102 | 103 | # 104 | # get user data of specific 
instance 105 | # 106 | def get_user_data(fname, master_dns, instance_type, include_aws_key): 107 | vcpu, vram, price = get_resource_map() 108 | data = open(fname).readlines() 109 | ret = [] 110 | if include_aws_key: 111 | print "The --include-aws-key option is on: embedding AWS credentials in user data..." 112 | 113 | for l in data: 114 | 115 | if l.startswith('MASTER ='): 116 | ret.append('MASTER = \'%s\'\n' % master_dns) 117 | elif l.startswith('NODE_TYPE ='): 118 | ret.append('NODE_TYPE = \'%s\'\n' % instance_type) 119 | elif l.startswith('NODE_VMEM ='): 120 | ret.append('NODE_VMEM = %d\n' % vram[instance_type]) 121 | elif l.startswith('NODE_VCPU ='): 122 | ret.append('NODE_VCPU = %d\n' % vcpu[instance_type]) 123 | elif l.startswith('AWS_KEY =') and include_aws_key: 124 | ret.append('AWS_KEY = \'%s\'\n' % os.getenv('AWS_SECRET_ACCESS_KEY', 'undefined')) 125 | elif l.startswith('AWS_ID =') and include_aws_key: 126 | ret.append('AWS_ID = \'%s\'\n' % os.getenv('AWS_ACCESS_KEY_ID', 'undefined')) 127 | else: 128 | ret.append(l) 129 | 130 | udata = ''.join(ret) 131 | return udata 132 | 133 | # get the AMI for the given instance type 134 | # (Ubuntu images; the choice depends on pvm vs hvm virtualization) 135 | def get_ami(instance): 136 | itype = ec2_util.get_instance_type(instance) 137 | if itype == 'pvm': 138 | return 'ami-6989a659' 139 | else: 140 | return 'ami-5189a661' 141 | 142 | # Launch the master of a cluster of the given name, by setting up its security groups, 143 | # and then starting new instances in them. 144 | # Returns the list of EC2 instance objects for the master. 145 | # Fails if there are already slave instances running in the cluster's groups. 146 | def launch_master(conn, opts): 147 | cluster_name = opts.cluster_name 148 | if opts.identity_file is None: 149 | print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." 150 | sys.exit(1) 151 | if opts.key_pair is None: 152 | print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." 153 | sys.exit(1) 154 | 155 | print "Setting up security groups..." 
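 # The cluster is defined by the two security groups below; the intra-group rules allow all traffic between cluster nodes, while the tcp ranges expose ssh and the Hadoop/YARN web UI and IPC ports.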
156 | master_group = ec2_util.get_or_make_group(conn, cluster_name + "-master") 157 | slave_group = ec2_util.get_or_make_group(conn, cluster_name + "-slave") 158 | if master_group.rules == []: # Group was just now created 159 | master_group.authorize(src_group=master_group) 160 | master_group.authorize(src_group=slave_group) 161 | master_group.authorize('tcp', 22, 22, '0.0.0.0/0') 162 | master_group.authorize('tcp', 8000, 8100, '0.0.0.0/0') 163 | master_group.authorize('tcp', 9000, 9999, '0.0.0.0/0') 164 | master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0') 165 | master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0') 166 | master_group.authorize('tcp', 50000, 50100, '0.0.0.0/0') 167 | master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') 168 | master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') 169 | master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') 170 | master_group.authorize('udp', 0, 65535, '0.0.0.0/0') 171 | if slave_group.rules == []: # Group was just now created 172 | slave_group.authorize(src_group=master_group) 173 | slave_group.authorize(src_group=slave_group) 174 | slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') 175 | slave_group.authorize('tcp', 8000, 8100, '0.0.0.0/0') 176 | slave_group.authorize('tcp', 9000, 9999, '0.0.0.0/0') 177 | slave_group.authorize('tcp', 50000, 50100, '0.0.0.0/0') 178 | slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') 179 | slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') 180 | slave_group.authorize('udp', 0, 65535, '0.0.0.0/0') 181 | 182 | # Check if instances are already running in our groups 183 | existing_masters, existing_slaves = ec2_util.get_existing_cluster(conn, cluster_name, 184 | die_on_error=False) 185 | if existing_slaves: 186 | print >> stderr, ("ERROR: There are already instances running in " + 187 | "group %s or %s" % (master_group.name, slave_group.name)) 188 | sys.exit(1) 189 | 190 | if opts.ami is None: 191 | opts.ami = get_ami(opts.instance_type) 192 | print "Launching instances..." 193 | 194 | try: 195 | image = conn.get_all_images(image_ids=[opts.ami])[0] 196 | except Exception: 197 | print >> stderr, "Could not find AMI " + opts.ami 198 | sys.exit(1) 199 | 200 | # Launch or resume masters 201 | if existing_masters: 202 | print "Starting master..." 203 | for inst in existing_masters: 204 | if inst.state not in ["shutting-down", "terminated"]: 205 | inst.start() 206 | master_nodes = existing_masters 207 | else: 208 | # Create block device mapping so that we can add an EBS volume if asked to 209 | block_map = ec2_util.get_block_device(opts.instance_type, 0) 210 | master_type = opts.instance_type 211 | if opts.zone == 'all': 212 | opts.zone = random.choice(conn.get_all_zones()).name 213 | master_res = image.run(key_name=opts.key_pair, 214 | security_groups=[master_group], 215 | instance_type=master_type, 216 | placement=opts.zone, 217 | min_count=1, 218 | max_count=1, 219 | block_device_map=block_map, 220 | user_data=get_user_data('bootstrap.py', '', 221 | master_type, opts.include_aws_key)) 222 | master_nodes = master_res.instances 223 | print "Launched master in %s, regid = %s" % (opts.zone, master_res.id) 224 | 225 | print 'Waiting for master to start up...' 
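 # ec2_util.wait_for_instances polls every 5 seconds until the instance leaves the 'pending' state and its system status check leaves 'initializing'.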
226 | ec2_util.wait_for_instances(conn, master_nodes) 227 | 228 | # Give the instances descriptive names 229 | for master in master_nodes: 230 | master.add_tag( 231 | key='Name', 232 | value='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)) 233 | master = master_nodes[0].public_dns_name 234 | print 'Finished launching master %s' % master 235 | # Return all the instances 236 | return master_nodes 237 | 238 | # Launch slaves for the cluster of the given name by starting new instances 239 | # in its slave security group. The slaves attach to the existing master. 240 | # The request is fulfilled by EC2 asynchronously. 241 | # Fails if the cluster has no running master. 242 | def launch_slaves(conn, opts): 243 | cluster_name = opts.cluster_name 244 | if opts.identity_file is None: 245 | print >> sys.stderr, "ERROR: Must provide an identity file (-i) for ssh connections." 246 | sys.exit(1) 247 | if opts.key_pair is None: 248 | print >> sys.stderr, "ERROR: Must provide a key pair name (-k) to use on instances." 249 | sys.exit(1) 250 | master_group = ec2_util.get_or_make_group(conn, cluster_name + "-master", False) 251 | slave_group = ec2_util.get_or_make_group(conn, cluster_name + "-slave", False) 252 | # Check if instances are already running in our groups 253 | existing_masters, existing_slaves = ec2_util.get_existing_cluster(conn, cluster_name, 254 | die_on_error=False) 255 | if len(existing_masters) == 0: 256 | print >> stderr, ("ERROR: Cannot find a master machine in " + 257 | "group %s" % (master_group.name)) 258 | sys.exit(1) 259 | 260 | if opts.ami is None: 261 | opts.ami = get_ami(opts.instance_type) 262 | print "Launching instances..." 263 | 264 | try: 265 | image = conn.get_all_images(image_ids=[opts.ami])[0] 266 | except Exception: 267 | print >> stderr, "Could not find AMI " + opts.ami 268 | sys.exit(1) 269 | 270 | master = existing_masters[0] 271 | block_map = ec2_util.get_block_device(opts.instance_type, 0) 272 | zone = master.placement 273 | slave_res = image.run(key_name=opts.key_pair, 274 | security_groups=[slave_group], 275 | instance_type=opts.instance_type, 276 | placement=zone, 277 | min_count=opts.slaves, 278 | max_count=opts.slaves, 279 | block_device_map=block_map, 280 | user_data=get_user_data('bootstrap.py', 281 | master.private_dns_name, 282 | opts.instance_type, 283 | opts.include_aws_key)) 284 | slave_nodes = slave_res.instances 285 | print "Launched %d slaves in %s, regid = %s" % (len(slave_nodes), 286 | zone, slave_res.id) 287 | print 'Waiting for slaves to start up...' 288 | ec2_util.wait_for_instances(conn, slave_nodes) 289 | for slave in slave_nodes: 290 | slave.add_tag( 291 | key='Name', 292 | value='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) 293 | print 'Done...' 294 | 295 | # Launch spot-instance slaves for the cluster of the given name, bidding 296 | # opts.spot_price (the on-demand price by default). 297 | # The request is submitted asynchronously; slaves join once it is fulfilled. 298 | # Fails if the cluster has no running master. 299 | def launch_spot_slaves(conn, opts): 300 | vcpu, vram, price = get_resource_map() 301 | cluster_name = opts.cluster_name 302 | if opts.identity_file is None: 303 | print >> sys.stderr, "ERROR: Must provide an identity file (-i) for ssh connections." 
304 | sys.exit(1) 305 | if opts.spot_price is None: 306 | opts.spot_price = price[opts.instance_type] 307 | print "Spot price not specified; bidding the on-demand price=%g for %s" % (opts.spot_price, opts.instance_type) 308 | 309 | if opts.key_pair is None: 310 | print >> sys.stderr, "ERROR: Must provide a key pair name (-k) to use on instances." 311 | sys.exit(1) 312 | 313 | master_group = ec2_util.get_or_make_group(conn, cluster_name + "-master", False) 314 | slave_group = ec2_util.get_or_make_group(conn, cluster_name + "-slave", False) 315 | # Check if instances are already running in our groups 316 | existing_masters, existing_slaves = ec2_util.get_existing_cluster(conn, cluster_name, 317 | die_on_error=False) 318 | if len(existing_masters) == 0: 319 | print >> stderr, ("ERROR: Cannot find a master machine in " + 320 | "group %s" % (master_group.name)) 321 | sys.exit(1) 322 | 323 | if opts.ami is None: 324 | opts.ami = get_ami(opts.instance_type) 325 | print "Launching spot instances type=%s, price=%g..." % (opts.instance_type, opts.spot_price) 326 | 327 | master = existing_masters[0] 328 | block_map = ec2_util.get_block_device(opts.instance_type, 0) 329 | zone = master.placement 330 | slave_reqs = conn.request_spot_instances( 331 | price=opts.spot_price, 332 | image_id=opts.ami, 333 | launch_group="launch-group-%s" % cluster_name, 334 | placement=zone, 335 | count=opts.slaves, 336 | key_name=opts.key_pair, 337 | security_groups=[slave_group], 338 | instance_type=opts.instance_type, 339 | block_device_map=block_map, 340 | user_data=get_user_data('bootstrap.py', 341 | master.private_dns_name, 342 | opts.instance_type, 343 | opts.include_aws_key)) 344 | print 'Done: the spot request has been submitted.' 345 | 346 | def stringify_command(parts): 347 | if isinstance(parts, str): 348 | return parts 349 | else: 350 | return ' '.join(map(pipes.quote, parts)) 351 | 352 | def ssh_args(opts): 353 | parts = ['-o', 'StrictHostKeyChecking=no'] 354 | if opts.identity_file is not None: 355 | parts += ['-i', opts.identity_file] 356 | return parts 357 | 358 | def ssh_command(opts): 359 | return ['ssh'] + ssh_args(opts) 360 | 361 | # Run a command on a host through ssh, retrying several times 362 | # and then throwing an exception if ssh continues to fail. 363 | def ssh(host, opts, command): 364 | tries = 0 365 | while True: 366 | try: 367 | return subprocess.check_call( 368 | ssh_command(opts) + ['-t', '-t', '%s@%s' % (opts.user, host), 369 | stringify_command(command)]) 370 | except subprocess.CalledProcessError as e: 371 | if tries > 5: 372 | # If this was an ssh failure, provide the user with hints. 
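 # (OpenSSH itself exits with status 255 on connection errors, as opposed to propagating the exit status of the remote command.)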
373 | if e.returncode == 255: 374 | raise UsageError( 375 | ("Failed to SSH to remote host {0}.\n" + 376 | "Please check that you have provided the correct --identity-file and " + 377 | "--key-pair parameters and try again.").format(host)) 378 | else: 379 | raise e 380 | print >> sys.stderr, \ 381 | "Error executing remote command, retrying after 30 seconds: {0}".format(e) 382 | time.sleep(30) 383 | tries = tries + 1 384 | 385 | def _check_output(*popenargs, **kwargs): 386 | if 'stdout' in kwargs: 387 | raise ValueError('stdout argument not allowed, it will be overridden.') 388 | process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) 389 | output, unused_err = process.communicate() 390 | retcode = process.poll() 391 | if retcode: 392 | cmd = kwargs.get("args") 393 | if cmd is None: 394 | cmd = popenargs[0] 395 | raise subprocess.CalledProcessError(retcode, cmd, output=output) 396 | return output 397 | 398 | def main(): 399 | logging.basicConfig() 400 | opts = parse_args() 401 | try: 402 | conn = ec2.connect_to_region(opts.region) 403 | except Exception as e: 404 | print >> sys.stderr, (e) 405 | sys.exit(1) 406 | 407 | if opts.zone == '': 408 | opts.zone = random.choice(conn.get_all_zones()).name 409 | 410 | action = opts.action 411 | cluster_name = opts.cluster_name 412 | 413 | if action == 'launch': 414 | master_nodes = launch_master(conn, opts) 415 | elif action == 'addslave': 416 | launch_slaves(conn, opts) 417 | elif action == 'addspot': 418 | launch_spot_slaves(conn, opts) 419 | elif action == "get-master": 420 | (master_nodes, slave_nodes) = ec2_util.get_existing_cluster(conn, cluster_name) 421 | print master_nodes[0].public_dns_name 422 | elif action == "login": 423 | (master_nodes, slave_nodes) = ec2_util.get_existing_cluster(conn, cluster_name) 424 | master = master_nodes[0].public_dns_name 425 | subprocess.check_call( 426 | ssh_command(opts) + ['-t', "%s@%s" % (opts.user, master)]) 427 | elif action == "forward-port": 428 | (master_nodes, slave_nodes) = ec2_util.get_existing_cluster(conn, cluster_name) 429 | master = master_nodes[0].public_dns_name 430 | subprocess.check_call( 431 | ssh_command(opts) + ['-D', '9595'] + ['-t', "%s@%s" % (opts.user, master)]) 432 | else: 433 | print >> sys.stderr, "Invalid action: %s" % action 434 | sys.exit(1) 435 | 436 | 437 | if __name__ == "__main__": 438 | main() 439 | --------------------------------------------------------------------------------
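As a quick sanity check of the resource-map parsing in yarn_ec2.py (a sketch, not part of the repo: it assumes you run it from the repository root so ```data/instance.matrix.txt``` resolves, and that ```boto``` is installed, since importing yarn_ec2 pulls it in):

```python
# check_resource_map.py -- hypothetical helper, not part of the repo
import yarn_ec2

vcpu, vram, price = yarn_ec2.get_resource_map('data/instance.matrix.txt')
# For m3.xlarge the matrix lists 4 vCPU, 15 GiB RAM and $0.280/hour,
# so (with the tab-separated file intact) this prints: 4 15360 0.28
print vcpu['m3.xlarge'], vram['m3.xlarge'], price['m3.xlarge']
```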