├── requirements.txt ├── .gitignore ├── ami-packer ├── ansible │ ├── roles │ │ ├── cloudinit │ │ │ ├── templates │ │ │ │ ├── 10_bootcmd.cfg │ │ │ │ └── configure-mdraid.sh │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── spark │ │ │ ├── templates │ │ │ │ ├── spark-alias.sh.j2 │ │ │ │ ├── spark-master.j2 │ │ │ │ ├── spark-worker.j2 │ │ │ │ └── spark-env.sh.j2 │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── openjdk │ │ │ └── tasks │ │ │ │ └── main.yaml │ │ ├── common │ │ │ └── tasks │ │ │ │ └── main.yaml │ │ └── cdh5 │ │ │ ├── templates │ │ │ └── hadoop │ │ │ │ ├── core-site.xml.j2 │ │ │ │ └── log4j.properties.j2 │ │ │ └── tasks │ │ │ └── main.yml │ └── main.yml ├── scripts │ ├── cleanup.sh │ └── setup_ansible.sh └── packer.json ├── tests ├── test-spark-app │ ├── build.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── test │ │ └── app │ │ └── AddLotsOfNumbers.scala └── test-spark-submit-works-simple-jar.sh ├── README.md ├── autoscaling-demo.md └── spark-cloud.py /requirements.txt: -------------------------------------------------------------------------------- 1 | boto>=2.20.1 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | tests/test-spark-app/project* 3 | tests/test-spark-app/.idea* 4 | tests/test-spark-app/target* 5 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/cloudinit/templates/10_bootcmd.cfg: -------------------------------------------------------------------------------- 1 | # commands to run on each boot 2 | bootcmd: 3 | - /etc/cloud/configure-mdraid.sh 4 | 5 | 6 | -------------------------------------------------------------------------------- /ami-packer/ansible/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | sudo: yes 4 | gather_facts: yes 5 | roles: 6 | - cloudinit 7 | - common 8 | - openjdk 9 | - cdh5 10 | - spark 11 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/spark/templates/spark-alias.sh.j2: -------------------------------------------------------------------------------- 1 | alias spark-shell="spark-shell --master `cat /etc/spark/conf/cluster-url`" 2 | alias spark-submit="spark-submit --master `cat /etc/spark/conf/cluster-url`" 3 | 4 | -------------------------------------------------------------------------------- /tests/test-spark-app/build.sbt: -------------------------------------------------------------------------------- 1 | val companyName = "app" 2 | 3 | val domain = "test" 4 | 5 | val projectName = "spark-cluster-launch-test" 6 | 7 | name := projectName 8 | 9 | scalaVersion := "2.10.4" 10 | 11 | val sparkVersion = "1.5.1" 12 | 13 | libraryDependencies ++= Seq( 14 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided" withSources() withJavadoc() 15 | ) 16 | 17 | organization := domain + "." 
+ companyName -------------------------------------------------------------------------------- /ami-packer/ansible/roles/openjdk/tasks/main.yaml: -------------------------------------------------------------------------------- 1 | - name: add openjdk ppa 2 | apt_repository: repo='ppa:openjdk-r/ppa' 3 | - name: add java repository key 4 | shell: apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 86F44E2A 5 | 6 | - name: install or update openjdk via apt 7 | apt: pkg={{ item }} update_cache=yes 8 | with_items: 9 | - openjdk-8-jre-headless 10 | - openjdk-8-jdk 11 | tags: 12 | - java 13 | -------------------------------------------------------------------------------- /ami-packer/scripts/cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get -y autoremove 4 | sudo apt-get -y clean 5 | 6 | echo "cleaning up dhcp leases" 7 | sudo rm /var/lib/dhcp/* 8 | 9 | echo "cleaning up udev rules" 10 | sudo rm -f /etc/udev/rules.d/70-persistent-net.rules 11 | sudo mkdir /etc/udev/rules.d/70-persistent-net.rules 12 | sudo rm -rf /dev/.udev/ 13 | sudo rm -f /lib/udev/rules.d/75-persistent-net-generator.rules 14 | 15 | 16 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/cloudinit/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: install required packages 2 | apt: pkg={{ item }} 3 | with_items: 4 | - mdadm 5 | 6 | - name: copy instance store configuration script 7 | template: src={{ item }} dest=/etc/cloud/{{ item }} owner=root group=root mode=0755 8 | with_items: 9 | - configure-mdraid.sh 10 | 11 | - name: cloudinit bootcmd 12 | template: src=10_bootcmd.cfg dest=/etc/cloud/cloud.cfg.d/10_bootcmd.cfg owner=root group=root mode=0644 13 | 14 | -------------------------------------------------------------------------------- /ami-packer/scripts/setup_ansible.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # wait for things to settle 4 | sleep 30 5 | 6 | # do not try to start services after installing 7 | cat < 22 | (1 to doSomethingNum).map(_.toString.length) 23 | .sum) 24 | .reduce(_ + _) 25 | 26 | val pw = new java.io.PrintWriter(new File(fullPathToFile)) 27 | try pw.write(count.toString + "\n") finally pw.close() 28 | 29 | } 30 | } -------------------------------------------------------------------------------- /ami-packer/ansible/roles/cloudinit/templates/configure-mdraid.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ### Setup instance stores with raid0 3 | 4 | NUM_DEVICES=`find /dev -name 'xvd[b-z]*' | wc -l` 5 | DEVICES=`find /dev -name 'xvd[b-z]*' -printf '%p\040'` 6 | 7 | mount -l | grep '/dev/md127' 8 | 9 | if [ $? 
-eq 1 ]; then 10 | echo "Mounting /dev/md127" 11 | 12 | for DEVICE in $DEVICES; do 13 | umount $DEVICE 14 | done 15 | 16 | 17 | yes | mdadm --create /dev/md127 --name=0 --level=0 -c256 --raid-devices=${NUM_DEVICES} --force $DEVICES 18 | echo "DEVICE $DEVICES" > /etc/mdadm.conf 19 | mdadm --detail --scan >> /etc/mdadm.conf 20 | 21 | blockdev --setra 65536 /dev/md127 22 | mkfs.ext4 /dev/md127 23 | mount -t ext4 -o noatime /dev/md127 /mnt 24 | mkdir /mnt/tmp 25 | chmod -R 777 /mnt 26 | chmod 1777 /mnt/tmp 27 | mount -o bind /mnt/tmp /tmp 28 | else 29 | echo "/dev/md127 already configured" 30 | fi 31 | 32 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/cdh5/templates/hadoop/core-site.xml.j2: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | 3 | 4 | <configuration> 5 | 6 | <property> 7 | <name>fs.trash.interval</name> 8 | <value>1440</value> 9 | </property> 10 | 11 | 12 | <property> 13 | <name>io.compression.codecs</name> 14 | <value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec, 15 | org.apache.hadoop.io.compress.BZip2Codec,com.hadoop.compression.lzo.LzoCodec, 16 | com.hadoop.compression.lzo.LzopCodec,org.apache.hadoop.io.compress.SnappyCodec</value> 17 | </property> 18 | <property> 19 | <name>io.compression.codec.lzo.class</name> 20 | <value>com.hadoop.compression.lzo.LzoCodec</value> 21 | </property> 22 | 23 | 24 | <property> 25 | <name>hadoop.proxyuser.oozie.hosts</name> 26 | <value>*</value> 27 | </property> 28 | <property> 29 | <name>hadoop.proxyuser.oozie.groups</name> 30 | <value>*</value> 31 | </property> 32 | 33 | 34 | <property> 35 | <name>hadoop.proxyuser.httpfs.hosts</name> 36 | <value>*</value> 37 | </property> 38 | <property> 39 | <name>hadoop.proxyuser.httpfs.groups</name> 40 | <value>*</value> 41 | </property> 42 | 43 | <property> 44 | <name>fs.s3n.multipart.uploads.enabled</name> 45 | <value>true</value> 46 | </property> 47 | 48 | </configuration> 49 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/cdh5/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: add Cloudera CDH5 key 2 | apt_key: url=http://archive.cloudera.com/cdh5/ubuntu/trusty/amd64/cdh/archive.key 3 | tags: 4 | - cdh5 5 | 6 | - name: add Cloudera CDH5 repository 7 | apt_repository: repo='deb [arch=amd64] http://archive.cloudera.com/cdh5/ubuntu/trusty/amd64/cdh trusty-cdh5 contrib' 8 | tags: 9 | - cdh5 10 | 11 | - name: add Cloudera CDH5 GPL extras repository 12 | apt_repository: repo='deb [arch=amd64] http://archive.cloudera.com/gplextras5/ubuntu/trusty/amd64/gplextras trusty-gplextras5 contrib' 13 | tags: 14 | - cdh5 15 | 16 | - name: install hadoop base pkgs 17 | apt: name={{ item }} state=present 18 | with_items: 19 | - hadoop 20 | - hadoop-hdfs 21 | - hadoop-client 22 | - hadoop-lzo 23 | tags: 24 | - cdh5 25 | 26 | - name: create /etc/hadoop/conf.cluster 27 | file: path=/etc/hadoop/conf.cluster state=directory owner=root group=root mode=0755 28 | register: create_hadoop_conf 29 | tags: 30 | - cdh5 31 | 32 | - name: create alternatives for hadoop-conf 33 | shell: update-alternatives --install /etc/hadoop/conf hadoop-conf {{ item }} 50 34 | with_items: 35 | - /etc/hadoop/conf.cluster 36 | when: create_hadoop_conf|changed 37 | tags: 38 | - cdh5 39 | 40 | - name: set alternatives for hadoop-conf 41 | shell: update-alternatives --set hadoop-conf {{ item }} 42 | with_items: 43 | - /etc/hadoop/conf.cluster 44 | when: create_hadoop_conf|changed 45 | tags: 46 | - cdh5 47 | 48 | - name: copy the hadoop configuration files 49 | template: src=hadoop/{{ item }}.j2 dest=/etc/hadoop/conf.cluster/{{ item }} owner=hdfs group=hadoop mode=0664 50 | with_items: 51 | - core-site.xml 52 | - log4j.properties 53 | register: copy_hadoop_conf 54 | tags: 55 | - cdh5 56 |
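The two `update-alternatives` tasks above register `/etc/hadoop/conf.cluster` and make it the active Hadoop configuration. As a quick sanity check while baking the AMI, a brief sketch (not part of the role; it only uses standard `update-alternatives` and `readlink`):

```
# confirm that the cluster configuration directory is the active hadoop-conf alternative
update-alternatives --display hadoop-conf
readlink -f /etc/hadoop/conf   # expected to resolve to /etc/hadoop/conf.cluster
```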
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spark-cloud 2 | Spark-cloud is a set of scripts for starting Spark clusters on EC2. 3 | 4 | [![Join the chat at https://gitter.im/entropyltd/spark-cloud](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/entropyltd/spark-cloud?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 5 | 6 | [![Code Health](https://landscape.io/github/entropyltd/spark-cloud/master/landscape.svg?style=flat)](https://landscape.io/github/entropyltd/spark-cloud/master) 7 | 8 | # Warning 9 | spark-cloud is alpha-quality, pre-release software; use it at your own risk. 10 | Always check that your clusters have started/stopped properly. 11 | 12 | spark-cloud currently only works in the us-east-1 AWS region; support for other regions is coming very soon! 13 | 14 | # Cluster Security 15 | Spark-cloud relies on IP-level security for access to the web UIs, so you should specify `--authorized-address=your.ip.address/32` when running a real cluster. 16 | 17 | At the moment the auto-scaling group starts with 2 slaves; the minimum is 2 slaves and the maximum is 8. 18 | 19 | # Example usage 20 | 21 | ## To launch a cluster into VPC 22 | 23 | ``` 24 | # set credentials 25 | export AWS_ACCESS_KEY=.. 26 | export AWS_SECRET_ACCESS_KEY=... 27 | # start cluster 28 | ./spark-cloud.py -k keypair --vpc-id=vpc-XXXXX --subnet-id=subnet-XXXXXX --zone=us-east-1a launch sparkcluster1 29 | ``` 30 | 31 | ## To launch a cluster into EC2-classic 32 | ``` 33 | # set credentials 34 | export AWS_ACCESS_KEY=.. 35 | export AWS_SECRET_ACCESS_KEY=... 36 | # start cluster 37 | ./spark-cloud.py -k keypair --zone=us-east-1e launch spark-ec2classic 38 | ``` 39 | 40 | ## To ssh into your cluster and run the spark shell 41 | 42 | To ssh in: 43 | 44 | ``` 45 | ssh -i path-to-keypair.pem ubuntu@master-host-which-is-helpfully-printed-at-launch 46 | ``` 47 | 48 | To run `spark-shell` you can't use `--master yarn-client`. 49 | The master URL is of the form `spark://host:port` and can be found by opening the Spark UI (whose address is helpfully printed at launch time). 50 | 51 | 52 | To run spark-shell, just: 53 | ``` 54 | spark-shell 55 | ``` 56 | 57 | # Termination 58 | 59 | Termination has a couple of known issues; if it does not work, just rerun the `destroy` command.
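A minimal, hedged example of tearing a cluster down (the cluster name and credentials are assumed to match the VPC launch example above):

```
# set credentials as before
export AWS_ACCESS_KEY=..
export AWS_SECRET_ACCESS_KEY=...
# tear down the cluster; rerunning is safe if the first attempt fails part-way
./spark-cloud.py destroy sparkcluster1
```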
60 | -------------------------------------------------------------------------------- /tests/test-spark-submit-works-simple-jar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | key_pair_path=$1 6 | key_pair=$2 7 | 8 | cluster_name=spark-ec2classic-test 9 | jar_name=test-spark-app.jar 10 | local_jar_path=test-spark-app/target/scala-2.10/spark-cluster-launch-test_2.10-0.1-SNAPSHOT.jar 11 | test_log_path=/tmp/spark-cloud-test.log 12 | 13 | user=ubuntu 14 | home_dir=/home/${user} 15 | 16 | job_result=job-result.txt 17 | 18 | working_directory=`pwd` 19 | 20 | correct_value="-1696934592" 21 | 22 | ssh_args="-o StrictHostKeyChecking=no -i ${key_pair_path}" 23 | 24 | script_path="../spark-cloud.py" 25 | 26 | if [ "${key_pair_path}" = "" ]; then 27 | echo "ERROR: please supply the key pair path as the first arg to this script" 28 | exit 1 29 | fi 30 | 31 | if [ "${key_pair}" = "" ]; then 32 | echo "ERROR: please supply the key pair as the second arg to this script" 33 | exit 1 34 | fi 35 | 36 | # TODO take a look at s3 config or aws config files to set automatically 37 | 38 | if [ "${AWS_SECRET_ACCESS_KEY}" = "" ]; then 39 | echo "ERROR: AWS_SECRET_ACCESS_KEY not set" 40 | exit 1 41 | fi 42 | 43 | if [ "${AWS_ACCESS_KEY}" = "" ]; then 44 | echo "ERROR: AWS_ACCESS_KEY not set" 45 | exit 1 46 | fi 47 | 48 | function extract-master-node-from-log { 49 | master=`cat ${test_log_path} | grep -o "ec2.*compute[\-]*[0-9]*\.amazonaws\.com" | head -1` 50 | if [ "${master}" = "" ]; then 51 | echo "ERROR: Did not find master node" 52 | exit 1 53 | fi 54 | echo ${master} 55 | } 56 | 57 | function create-cluster { 58 | ${script_path} -k ${key_pair} --zone=us-east-1e --max-spot-price=0.02 --min-instances=1 --max-instances=3 launch ${cluster_name} | tee ${test_log_path} 59 | } 60 | 61 | function build-simple-spark-app { 62 | cd ${working_directory}/test-spark-app 63 | sbt package 64 | cd ${working_directory} 65 | } 66 | 67 | function spark-submit-simple-app { 68 | echo "INFO: Copying jar" 69 | scp ${ssh_args} ${local_jar_path} ${user}@$1:${home_dir}/${jar_name} 70 | 71 | echo "INFO: Getting spark master URL" 72 | host=`ssh ${ssh_args} ${user}@$1 "hostname"` 73 | spark_master=spark://${host}.ec2.internal:7077 74 | 75 | echo "INFO: Spark master url: $spark_master" 76 | 77 | echo "INFO: Running spark-submit" 78 | ssh ${ssh_args} ${user}@$1 "spark-submit --master $spark_master --class test.app.AddLotsOfNumbers ${jar_name} ${home_dir}/${job_result}" 79 | } 80 | 81 | # TODO Add more tests, like: 82 | # to force it to scale 83 | # curl the UI, etc 84 | function check-output-of-job { 85 | scp ${ssh_args} ${user}@$1:${home_dir}/${job_result} /tmp/ 86 | value=`cat /tmp/${job_result}` 87 | 88 | if [ "${value}" = "${correct_value}" ]; then 89 | echo "INFO: Test passed!!!" 
90 | else 91 | echo "ERROR: Test failed, expected ${correct_value} but got ${value}" 92 | exit 1 93 | fi 94 | } 95 | 96 | function destroy-cluster { 97 | echo "INFO: Destroying cluster" 98 | ${script_path} destroy spark-ec2classic-test 99 | } 100 | 101 | trap destroy-cluster EXIT 102 | 103 | create-cluster 104 | master=`extract-master-node-from-log` 105 | 106 | echo "INFO: master: $master" 107 | 108 | build-simple-spark-app 109 | 110 | spark-submit-simple-app ${master} 111 | 112 | check-output-of-job ${master} 113 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/spark/templates/spark-master.j2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # Starts a Spark master 19 | # 20 | # chkconfig: 2345 86 14 21 | # description: Spark master 22 | # 23 | ### BEGIN INIT INFO 24 | # Provides: spark-master 25 | # Short-Description: Spark master 26 | # Default-Start: 2 3 4 5 27 | # Default-Stop: 0 1 6 28 | # Required-Start: $syslog $remote_fs 29 | # Required-Stop: $syslog $remote_fs 30 | # Should-Start: 31 | # Should-Stop: 32 | ### END INIT INFO 33 | 34 | CONF_DIR="/etc/spark/conf" 35 | if [ -f $CONF_DIR/spark-env.sh ]; then 36 | . $CONF_DIR/spark-env.sh 37 | fi 38 | echo "spark://${STANDALONE_SPARK_MASTER_HOST}:7077" > /etc/spark/conf/cluster-url 39 | if [ ! -z "$SPARK_MASTER" ]; then 40 | echo "Not a master, not starting" 41 | exit 0 42 | fi 43 | 44 | . /lib/lsb/init-functions 45 | BIGTOP_DEFAULTS_DIR=${BIGTOP_DEFAULTS_DIR-/etc/default} 46 | [ -n "${BIGTOP_DEFAULTS_DIR}" -a -r ${BIGTOP_DEFAULTS_DIR}/hadoop ] && . ${BIGTOP_DEFAULTS_DIR}/hadoop 47 | [ -n "${BIGTOP_DEFAULTS_DIR}" -a -r ${BIGTOP_DEFAULTS_DIR}/spark-master ] && . ${BIGTOP_DEFAULTS_DIR}/spark-master 48 | 49 | # Autodetect JAVA_HOME if not defined 50 | . 
/usr/lib/bigtop-utils/bigtop-detect-javahome 51 | 52 | RETVAL_SUCCESS=0 53 | 54 | STATUS_RUNNING=0 55 | STATUS_DEAD=1 56 | STATUS_DEAD_AND_LOCK=2 57 | STATUS_NOT_RUNNING=3 58 | STATUS_OTHER_ERROR=102 59 | 60 | 61 | ERROR_PROGRAM_NOT_INSTALLED=5 62 | ERROR_PROGRAM_NOT_CONFIGURED=6 63 | 64 | 65 | RETVAL=0 66 | SLEEP_TIME=5 67 | PROC_NAME="java" 68 | 69 | DAEMON="spark-master" 70 | DESC="Spark master" 71 | EXEC_PATH="/usr/lib/spark/bin/spark-class" 72 | EXEC_DIR="" 73 | SVC_USER="spark" 74 | DAEMON_FLAGS="" 75 | CONF_DIR="/etc/spark/conf" 76 | PIDFILE="/var/run/spark/spark-master.pid" 77 | LOCKDIR="/var/lock/subsys" 78 | LOCKFILE="$LOCKDIR/spark-master" 79 | WORKING_DIR="/var/lib/spark" 80 | 81 | install -d -m 0755 -o spark -g spark /var/run/spark 1>/dev/null 2>&1 || : 82 | [ -d "$LOCKDIR" ] || install -d -m 0755 $LOCKDIR 1>/dev/null 2>&1 || : 83 | start() { 84 | [ -x $EXE_FILE ] || exit $ERROR_PROGRAM_NOT_INSTALLED 85 | log_success_msg "Starting $DESC (${DAEMON}): " 86 | 87 | checkstatusofproc 88 | status=$? 89 | if [ "$status" -eq "$STATUS_RUNNING" ]; then 90 | log_success_msg "${DESC} is running" 91 | exit 0 92 | fi 93 | 94 | LOG_FILE=/var/log/spark/${DAEMON}.out 95 | 96 | su -s /bin/bash $SVC_USER -c "nohup nice -n 0 \ 97 | ${EXEC_PATH} org.apache.spark.deploy.master.Master $DAEMON_FLAGS \ 98 | > $LOG_FILE 2>&1 & "'echo $!' > "$PIDFILE" 99 | 100 | sleep 3 101 | 102 | checkstatusofproc 103 | RETVAL=$? 104 | [ $RETVAL -eq $STATUS_RUNNING ] && touch $LOCKFILE 105 | return $RETVAL 106 | } 107 | stop() { 108 | log_success_msg "Stopping $DESC (${DAEMON}): " 109 | killproc -p $PIDFILE java 110 | RETVAL=$? 111 | 112 | [ $RETVAL -eq $RETVAL_SUCCESS ] && rm -f $LOCKFILE $PIDFILE 113 | return $RETVAL 114 | } 115 | restart() { 116 | stop 117 | start 118 | } 119 | 120 | checkstatusofproc(){ 121 | pidofproc -p $PIDFILE $PROC_NAME > /dev/null 122 | } 123 | 124 | checkstatus(){ 125 | checkstatusofproc 126 | status=$? 127 | 128 | case "$status" in 129 | $STATUS_RUNNING) 130 | log_success_msg "${DESC} is running" 131 | ;; 132 | $STATUS_DEAD) 133 | log_failure_msg "${DESC} is dead and pid file exists" 134 | ;; 135 | $STATUS_DEAD_AND_LOCK) 136 | log_failure_msg "${DESC} is dead and lock file exists" 137 | ;; 138 | $STATUS_NOT_RUNNING) 139 | log_failure_msg "${DESC} is not running" 140 | ;; 141 | *) 142 | log_failure_msg "${DESC} status is unknown" 143 | ;; 144 | esac 145 | return $status 146 | } 147 | 148 | condrestart(){ 149 | [ -e $LOCKFILE ] && restart || : 150 | } 151 | 152 | check_for_root() { 153 | if [ $(id -ur) -ne 0 ]; then 154 | echo 'Error: root user required' 155 | echo 156 | exit 1 157 | fi 158 | } 159 | 160 | service() { 161 | case "$1" in 162 | start) 163 | check_for_root 164 | start 165 | ;; 166 | stop) 167 | check_for_root 168 | stop 169 | ;; 170 | status) 171 | checkstatus 172 | RETVAL=$? 173 | ;; 174 | restart) 175 | check_for_root 176 | restart 177 | ;; 178 | condrestart|try-restart) 179 | check_for_root 180 | condrestart 181 | ;; 182 | *) 183 | echo $"Usage: $0 {start|stop|status|restart|try-restart|condrestart}" 184 | exit 1 185 | esac 186 | } 187 | 188 | service "$@" 189 | 190 | exit $RETVAL 191 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/spark/templates/spark-worker.j2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. 
See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # Starts a Spark worker 19 | # 20 | # chkconfig: 2345 87 13 21 | # description: Spark worker 22 | # 23 | ### BEGIN INIT INFO 24 | # Provides: spark-worker 25 | # Short-Description: Spark worker 26 | # Default-Start: 2 3 4 5 27 | # Default-Stop: 0 1 6 28 | # Required-Start: $syslog $remote_fs 29 | # Required-Stop: $syslog $remote_fs 30 | # Should-Start: 31 | # Should-Stop: 32 | ### END INIT INFO 33 | 34 | CONF_DIR="/etc/spark/conf" 35 | if [ -f $CONF_DIR/spark-env.sh ]; then 36 | . $CONF_DIR/spark-env.sh 37 | fi 38 | echo "spark://${STANDALONE_SPARK_MASTER_HOST}:7077" > /etc/spark/conf/cluster-url 39 | if [ -z "$SPARK_MASTER" ]; then 40 | echo "I'm a master, worker not starting" 41 | exit 0 42 | fi 43 | 44 | . /lib/lsb/init-functions 45 | BIGTOP_DEFAULTS_DIR=${BIGTOP_DEFAULTS_DIR-/etc/default} 46 | [ -n "${BIGTOP_DEFAULTS_DIR}" -a -r ${BIGTOP_DEFAULTS_DIR}/hadoop ] && . ${BIGTOP_DEFAULTS_DIR}/hadoop 47 | [ -n "${BIGTOP_DEFAULTS_DIR}" -a -r ${BIGTOP_DEFAULTS_DIR}/spark-worker ] && . ${BIGTOP_DEFAULTS_DIR}/spark-worker 48 | 49 | # Autodetect JAVA_HOME if not defined 50 | . /usr/lib/bigtop-utils/bigtop-detect-javahome 51 | 52 | RETVAL_SUCCESS=0 53 | 54 | STATUS_RUNNING=0 55 | STATUS_DEAD=1 56 | STATUS_DEAD_AND_LOCK=2 57 | STATUS_NOT_RUNNING=3 58 | STATUS_OTHER_ERROR=102 59 | 60 | 61 | ERROR_PROGRAM_NOT_INSTALLED=5 62 | ERROR_PROGRAM_NOT_CONFIGURED=6 63 | 64 | 65 | RETVAL=0 66 | SLEEP_TIME=5 67 | PROC_NAME="java" 68 | 69 | DAEMON="spark-worker" 70 | DESC="Spark worker" 71 | EXEC_PATH="/usr/lib/spark/bin/spark-class" 72 | EXEC_DIR="" 73 | SVC_USER="spark" 74 | DAEMON_FLAGS="" 75 | CONF_DIR="/etc/spark/conf" 76 | PIDFILE="/var/run/spark/spark-worker.pid" 77 | LOCKDIR="/var/lock/subsys" 78 | LOCKFILE="$LOCKDIR/spark-worker" 79 | WORKING_DIR="/var/lib/spark" 80 | 81 | install -d -m 0755 -o spark -g spark /var/run/spark 1>/dev/null 2>&1 || : 82 | [ -d "$LOCKDIR" ] || install -d -m 0755 $LOCKDIR 1>/dev/null 2>&1 || : 83 | start() { 84 | [ -x $EXE_FILE ] || exit $ERROR_PROGRAM_NOT_INSTALLED 85 | log_success_msg "Starting $DESC (${DAEMON}): " 86 | 87 | checkstatusofproc 88 | status=$? 89 | if [ "$status" -eq "$STATUS_RUNNING" ]; then 90 | log_success_msg "${DESC} is running" 91 | exit 0 92 | fi 93 | 94 | LOG_FILE=/var/log/spark/${DAEMON}.out 95 | 96 | if [ -f $CONF_DIR/spark-env.sh ]; then 97 | . $CONF_DIR/spark-env.sh 98 | fi 99 | 100 | su -s /bin/bash $SVC_USER -c "nohup nice -n 0 \ 101 | ${EXEC_PATH} org.apache.spark.deploy.worker.Worker spark://$STANDALONE_SPARK_MASTER_HOST:$SPARK_MASTER_PORT $DAEMON_FLAGS \ 102 | > $LOG_FILE 2>&1 & "'echo $!' > "$PIDFILE" 103 | 104 | sleep 3 105 | 106 | checkstatusofproc 107 | RETVAL=$? 
108 | [ $RETVAL -eq $STATUS_RUNNING ] && touch $LOCKFILE 109 | return $RETVAL 110 | } 111 | stop() { 112 | log_success_msg "Stopping $DESC (${DAEMON}): " 113 | killproc -p $PIDFILE java 114 | RETVAL=$? 115 | 116 | [ $RETVAL -eq $RETVAL_SUCCESS ] && rm -f $LOCKFILE $PIDFILE 117 | return $RETVAL 118 | } 119 | restart() { 120 | stop 121 | start 122 | } 123 | 124 | checkstatusofproc(){ 125 | pidofproc -p $PIDFILE $PROC_NAME > /dev/null 126 | } 127 | 128 | checkstatus(){ 129 | checkstatusofproc 130 | status=$? 131 | 132 | case "$status" in 133 | $STATUS_RUNNING) 134 | log_success_msg "${DESC} is running" 135 | ;; 136 | $STATUS_DEAD) 137 | log_failure_msg "${DESC} is dead and pid file exists" 138 | ;; 139 | $STATUS_DEAD_AND_LOCK) 140 | log_failure_msg "${DESC} is dead and lock file exists" 141 | ;; 142 | $STATUS_NOT_RUNNING) 143 | log_failure_msg "${DESC} is not running" 144 | ;; 145 | *) 146 | log_failure_msg "${DESC} status is unknown" 147 | ;; 148 | esac 149 | return $status 150 | } 151 | 152 | condrestart(){ 153 | [ -e $LOCKFILE ] && restart || : 154 | } 155 | 156 | check_for_root() { 157 | if [ $(id -ur) -ne 0 ]; then 158 | echo 'Error: root user required' 159 | echo 160 | exit 1 161 | fi 162 | } 163 | 164 | service() { 165 | case "$1" in 166 | start) 167 | check_for_root 168 | start 169 | ;; 170 | stop) 171 | check_for_root 172 | stop 173 | ;; 174 | status) 175 | checkstatus 176 | RETVAL=$? 177 | ;; 178 | restart) 179 | check_for_root 180 | restart 181 | ;; 182 | condrestart|try-restart) 183 | check_for_root 184 | condrestart 185 | ;; 186 | *) 187 | echo $"Usage: $0 {start|stop|status|restart|try-restart|condrestart}" 188 | exit 1 189 | esac 190 | } 191 | 192 | service "$@" 193 | 194 | exit $RETVAL 195 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/spark/templates/spark-env.sh.j2: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This file is sourced when running various Spark programs. 4 | # Copy it as spark-env.sh and edit that to configure Spark for your site. 5 | 6 | # Options read when launching programs locally with 7 | # ./bin/run-example or ./bin/spark-submit 8 | # - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files 9 | # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node 10 | # - SPARK_PUBLIC_DNS, to set the public dns name of the driver program 11 | # - SPARK_CLASSPATH, default classpath entries to append 12 | 13 | # Options read by executors and drivers running inside the cluster 14 | # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node 15 | # - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program 16 | # - SPARK_CLASSPATH, default classpath entries to append 17 | # - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data 18 | # - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos 19 | 20 | # Options read in YARN client mode 21 | # - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files 22 | # - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2) 23 | # - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1). 24 | # - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G) 25 | # - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 
1000M, 2G) (Default: 1G) 26 | # - SPARK_YARN_APP_NAME, The name of your application (Default: Spark) 27 | # - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: ‘default’) 28 | # - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job. 29 | # - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job. 30 | 31 | # Options for the daemons used in the standalone deploy mode 32 | # - SPARK_MASTER_IP, to bind the master to a different IP address or hostname 33 | # - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master 34 | # - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") 35 | # - SPARK_WORKER_CORES, to set the number of cores to use on this machine 36 | # - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g) 37 | # - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker 38 | # - SPARK_WORKER_INSTANCES, to set the number of worker processes per node 39 | # - SPARK_WORKER_DIR, to set the working directory of worker processes 40 | # - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") 41 | # - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). 42 | # - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") 43 | # - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") 44 | # - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") 45 | # - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers 46 | 47 | # Generic options for the daemons used in the standalone deploy mode 48 | # - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) 49 | # - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) 50 | # - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) 51 | # - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) 52 | # - SPARK_NICENESS The scheduling priority for daemons. (Default: 0) 53 | 54 | ### 55 | ### === IMPORTANT === 56 | ### Change the following to specify a real cluster's Master host 57 | ### 58 | TMP=`tempfile` 59 | curl -f -s http://169.254.169.254/latest/user-data > $TMP || true 60 | chmod a+rx $TMP 61 | . 
$TMP 62 | rm -f $TMP 63 | PUBLIC_HOSTNAME=`curl -s http://169.254.169.254/latest/meta-data/hostname` 64 | 65 | if [ -z "$SPARK_MASTER" ] 66 | then 67 | export STANDALONE_SPARK_MASTER_HOST=$PUBLIC_HOSTNAME 68 | else 69 | export STANDALONE_SPARK_MASTER_HOST=$SPARK_MASTER 70 | fi 71 | 72 | export SPARK_MASTER_IP=$STANDALONE_SPARK_MASTER_HOST 73 | export SPARK_MASTER_HOST=$STANDALONE_SPARK_MASTER_HOST 74 | export SPARK_LOCAL_IP=$PUBLIC_HOSTNAME 75 | export SPARK_PUBLIC_DNS=$PUBLIC_HOSTNAME 76 | 77 | ### Let's run everything with JVM runtime, instead of Scala 78 | export SPARK_LAUNCH_WITH_SCALA=0 79 | export SPARK_LIBRARY_PATH=${SPARK_HOME}/lib 80 | export SPARK_MASTER_WEBUI_PORT=18080 81 | export SPARK_MASTER_PORT=7077 82 | export SPARK_WORKER_PORT=7078 83 | export SPARK_WORKER_WEBUI_PORT=18081 84 | export SPARK_WORKER_DIR=/var/run/spark/work 85 | export SPARK_LOG_DIR=/var/log/spark 86 | export SPARK_PID_DIR='/var/run/spark/' 87 | 88 | if [ -n "$HADOOP_HOME" ]; then 89 | export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/libfakeroot:/usr/lib64/libfakeroot:/usr/lib32/libfakeroot:/usr/lib/hadoop/lib/native 90 | fi 91 | 92 | if [ -d "/usr/lib/hadoop/lib/native" ]; then 93 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib/hadoop/lib/native 94 | fi 95 | 96 | export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop/conf} 97 | 98 | if [[ -d $SPARK_HOME/python ]] 99 | then 100 | for i in 101 | do 102 | SPARK_DIST_CLASSPATH=${SPARK_DIST_CLASSPATH}:$i 103 | done 104 | fi 105 | 106 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:$SPARK_LIBRARY_PATH/spark-assembly.jar" 107 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:" 108 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop/lib/*" 109 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop/*" 110 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop-hdfs/lib/*" 111 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop-hdfs/*" 112 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop-mapreduce/lib/*" 113 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop-mapreduce/*" 114 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop-yarn/lib/*" 115 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop-yarn/*" 116 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hive/lib/*" 117 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/flume-ng/lib/*" 118 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/paquet/lib/*" 119 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/avro/lib/*" 120 | -------------------------------------------------------------------------------- /autoscaling-demo.md: -------------------------------------------------------------------------------- 1 | 2 | # Autoscaling Demo 3 | 4 | (This demo assumes default puny instance size) 5 | 6 | Once in the shell try running some dummy code to consume the resources, e.g. 7 | 8 | ``` 9 | sc.makeRDD(1 to 1000000).repartition(400).map(_ => (1 to 20000).map(_.toString.length).reduce(_ + _)).reduce(_ + _) 10 | ``` 11 | 12 | You should observe the rate of log entries is around 2 - 5 seconds, e.g. 
13 | 14 | ``` 15 | 15/12/03 14:01:55 INFO scheduler.TaskSetManager: Starting task 15.0 in stage 1.0 (TID 18, 172.31.60.214, partition 15,PROCESS_LOCAL, 2170 bytes) 16 | 15/12/03 14:01:55 INFO scheduler.TaskSetManager: Finished task 12.0 in stage 1.0 (TID 15) in 5668 ms on 172.31.60.214 (13/400) 17 | 15/12/03 14:01:55 INFO scheduler.TaskSetManager: Starting task 16.0 in stage 1.0 (TID 19, 172.31.51.194, partition 16,PROCESS_LOCAL, 2170 bytes) 18 | 15/12/03 14:01:55 INFO scheduler.TaskSetManager: Finished task 13.0 in stage 1.0 (TID 16) in 5627 ms on 172.31.51.194 (14/400) 19 | 15/12/03 14:01:56 INFO scheduler.TaskSetManager: Starting task 17.0 in stage 1.0 (TID 20, 172.31.54.237, partition 17,PROCESS_LOCAL, 2170 bytes) 20 | 15/12/03 14:01:56 INFO scheduler.TaskSetManager: Finished task 14.0 in stage 1.0 (TID 17) in 5621 ms on 172.31.54.237 (15/400) 21 | 15/12/03 14:02:01 INFO scheduler.TaskSetManager: Starting task 18.0 in stage 1.0 (TID 21, 172.31.60.214, partition 18,PROCESS_LOCAL, 2170 bytes) 22 | 15/12/03 14:02:01 INFO scheduler.TaskSetManager: Finished task 15.0 in stage 1.0 (TID 18) in 5665 ms on 172.31.60.214 (16/400) 23 | 15/12/03 14:02:01 INFO scheduler.TaskSetManager: Starting task 19.0 in stage 1.0 (TID 22, 172.31.51.194, partition 19,PROCESS_LOCAL, 2170 bytes) 24 | 15/12/03 14:02:01 INFO scheduler.TaskSetManager: Finished task 16.0 in stage 1.0 (TID 19) in 5621 ms on 172.31.51.194 (17/400) 25 | ``` 26 | 27 | After around 50 tasks you should see some nodes spinning up in the ec2 dashboard (https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#Instances:securityGroupName=spark-ec2classic-slaves;sort=instanceId) 28 | 29 | After around 120 tasks you should see some messages about adding executors: 30 | 31 | ``` 32 | 15/12/03 14:05:17 INFO client.AppClient$ClientEndpoint: Executor added: app-20151203140010-0003/3 on worker-20151203140515-172.31.61.144-7078 (172.31.61.144:7078) with 1 cores 33 | 15/12/03 14:05:17 INFO cluster.SparkDeploySchedulerBackend: Granted executor ID app-20151203140010-0003/3 on hostPort 172.31.61.144:7078 with 1 cores, 1024.0 MB RAM 34 | 15/12/03 14:05:17 INFO client.AppClient$ClientEndpoint: Executor updated: app-20151203140010-0003/3 is now RUNNING 35 | 15/12/03 14:05:18 INFO scheduler.TaskSetManager: Starting task 123.0 in stage 1.0 (TID 126, 172.31.51.194, partition 123,PROCESS_LOCAL, 2170 bytes) 36 | 15/12/03 14:05:18 INFO scheduler.TaskSetManager: Finished task 120.0 in stage 1.0 (TID 123) in 5628 ms on 172.31.51.194 (121/400) 37 | 15/12/03 14:05:18 INFO scheduler.TaskSetManager: Starting task 124.0 in stage 1.0 (TID 127, 172.31.54.237, partition 124,PROCESS_LOCAL, 2170 bytes) 38 | 15/12/03 14:05:18 INFO scheduler.TaskSetManager: Finished task 121.0 in stage 1.0 (TID 124) in 5567 ms on 172.31.54.237 (122/400) 39 | 15/12/03 14:05:18 INFO client.AppClient$ClientEndpoint: Executor updated: app-20151203140010-0003/3 is now LOADING 40 | 15/12/03 14:05:22 INFO client.AppClient$ClientEndpoint: Executor added: app-20151203140010-0003/4 on worker-20151203140520-172.31.61.145-7078 (172.31.61.145:7078) with 1 cores 41 | 15/12/03 14:05:22 INFO cluster.SparkDeploySchedulerBackend: Granted executor ID app-20151203140010-0003/4 on hostPort 172.31.61.145:7078 with 1 cores, 1024.0 MB RAM 42 | 15/12/03 14:05:22 INFO client.AppClient$ClientEndpoint: Executor updated: app-20151203140010-0003/4 is now RUNNING 43 | 15/12/03 14:05:22 INFO client.AppClient$ClientEndpoint: Executor updated: app-20151203140010-0003/4 is now LOADING 44 | 15/12/03 14:05:22 INFO 
scheduler.TaskSetManager: Starting task 125.0 in stage 1.0 (TID 128, 172.31.60.214, partition 125,PROCESS_LOCAL, 2170 bytes) 45 | 15/12/03 14:05:22 INFO scheduler.TaskSetManager: Finished task 122.0 in stage 1.0 (TID 125) in 6466 ms on 172.31.60.214 (123/400) 46 | 15/12/03 14:05:22 INFO client.AppClient$ClientEndpoint: Executor added: app-20151203140010-0003/5 on worker-20151203140521-172.31.61.146-7078 (172.31.61.146:7078) with 1 cores 47 | 15/12/03 14:05:22 INFO cluster.SparkDeploySchedulerBackend: Granted executor ID app-20151203140010-0003/5 on hostPort 172.31.61.146:7078 with 1 cores, 1024.0 MB RAM 48 | 15/12/03 14:05:22 INFO client.AppClient$ClientEndpoint: Executor updated: app-20151203140010-0003/5 is now RUNNING 49 | 15/12/03 14:05:23 INFO client.AppClient$ClientEndpoint: Executor updated: app-20151203140010-0003/5 is now LOADING 50 | ``` 51 | 52 | Then you should start to see more distinct IP addresses in the log entries, and the log entries should be printed at a noticeably faster rate (every second). Your job is now using more nodes! 53 | 54 | ``` 55 | 15/12/03 14:07:36 INFO scheduler.TaskSetManager: Starting task 302.0 in stage 1.0 (TID 305, 172.31.60.214, partition 302,PROCESS_LOCAL, 2170 bytes) 56 | 15/12/03 14:07:36 INFO scheduler.TaskSetManager: Finished task 294.0 in stage 1.0 (TID 297) in 5641 ms on 172.31.60.214 (295/400) 57 | 15/12/03 14:07:37 INFO scheduler.TaskSetManager: Starting task 303.0 in stage 1.0 (TID 306, 172.31.61.142, partition 303,PROCESS_LOCAL, 2170 bytes) 58 | 15/12/03 14:07:37 INFO scheduler.TaskSetManager: Finished task 295.0 in stage 1.0 (TID 298) in 5561 ms on 172.31.61.142 (296/400) 59 | 15/12/03 14:07:38 INFO scheduler.TaskSetManager: Starting task 304.0 in stage 1.0 (TID 307, 172.31.51.194, partition 304,PROCESS_LOCAL, 2170 bytes) 60 | 15/12/03 14:07:38 INFO scheduler.TaskSetManager: Finished task 296.0 in stage 1.0 (TID 299) in 5744 ms on 172.31.51.194 (297/400) 61 | 15/12/03 14:07:38 INFO scheduler.TaskSetManager: Starting task 305.0 in stage 1.0 (TID 308, 172.31.54.237, partition 305,PROCESS_LOCAL, 2170 bytes) 62 | 15/12/03 14:07:38 INFO scheduler.TaskSetManager: Finished task 297.0 in stage 1.0 (TID 300) in 5611 ms on 172.31.54.237 (298/400) 63 | 15/12/03 14:07:38 INFO scheduler.TaskSetManager: Starting task 306.0 in stage 1.0 (TID 309, 172.31.61.144, partition 306,PROCESS_LOCAL, 2170 bytes) 64 | 15/12/03 14:07:38 INFO scheduler.TaskSetManager: Finished task 298.0 in stage 1.0 (TID 301) in 5790 ms on 172.31.61.144 (299/400) 65 | 15/12/03 14:07:40 INFO scheduler.TaskSetManager: Starting task 307.0 in stage 1.0 (TID 310, 172.31.61.143, partition 307,PROCESS_LOCAL, 2170 bytes) 66 | 15/12/03 14:07:40 INFO scheduler.TaskSetManager: Finished task 299.0 in stage 1.0 (TID 302) in 5749 ms on 172.31.61.143 (300/400) 67 | 15/12/03 14:07:41 INFO scheduler.TaskSetManager: Starting task 308.0 in stage 1.0 (TID 311, 172.31.61.145, partition 308,PROCESS_LOCAL, 2170 bytes) 68 | 15/12/03 14:07:41 INFO scheduler.TaskSetManager: Finished task 301.0 in stage 1.0 (TID 304) in 5402 ms on 172.31.61.145 (301/400) 69 | 15/12/03 14:07:41 INFO scheduler.TaskSetManager: Starting task 309.0 in stage 1.0 (TID 312, 172.31.61.146, partition 309,PROCESS_LOCAL, 2170 bytes) 70 | 15/12/03 14:07:41 INFO scheduler.TaskSetManager: Finished task 300.0 in stage 1.0 (TID 303) in 5634 ms on 172.31.61.146 (302/400) 71 | 15/12/03 14:07:42 INFO scheduler.TaskSetManager: Starting task 310.0 in stage 1.0 (TID 313, 172.31.60.214, partition 310,PROCESS_LOCAL, 2170 bytes) 72 | 15/12/03
14:07:42 INFO scheduler.TaskSetManager: Finished task 302.0 in stage 1.0 (TID 305) in 5676 ms on 172.31.60.214 (303/400) 73 | 15/12/03 14:07:42 INFO scheduler.TaskSetManager: Starting task 311.0 in stage 1.0 (TID 314, 172.31.61.142, partition 311,PROCESS_LOCAL, 2170 bytes) 74 | ``` 75 | 76 | A minute or two later it finishes: 77 | 78 | ``` 79 | 15/12/03 14:08:49 INFO scheduler.DAGScheduler: Job 0 finished: reduce at <console>:16, took 452.183523 s 80 | res0: Int = -1300313216 81 | ``` 82 | 83 | Now try running the exact same command again (promptly after the first one); you will see the job finish faster, since the whole job, rather than just part of it, runs with the extra resources: 84 | 85 | ``` 86 | 15/12/03 14:16:57 INFO scheduler.DAGScheduler: Job 1 finished: reduce at <console>:16, took 282.684993 s 87 | res1: Int = -1300313216 88 | ``` 89 | 90 | Finally, go make a cup of tea or something; when you return you will see that some of the instances have been terminated, and after a few more minutes the group will be back down to just 2. 91 | 92 | http://i.imgur.com/fBLF2EN.gif 93 | 94 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/cdh5/templates/hadoop/log4j.properties.j2: -------------------------------------------------------------------------------- 1 | # Copyright 2011 The Apache Software Foundation 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | # Define some default values that can be overridden by system properties 20 | hadoop.root.logger=INFO,console 21 | hadoop.log.dir=. 22 | hadoop.log.file=hadoop.log 23 | 24 | # Define the root logger to the system property "hadoop.root.logger". 25 | log4j.rootLogger=${hadoop.root.logger}, EventCounter 26 | 27 | # Logging Threshold 28 | log4j.threshold=ALL 29 | 30 | # Null Appender 31 | log4j.appender.NullAppender=org.apache.log4j.varia.NullAppender 32 | 33 | # 34 | # Rolling File Appender - cap space usage at 5gb.
35 | # 36 | hadoop.log.maxfilesize=256MB 37 | hadoop.log.maxbackupindex=20 38 | log4j.appender.RFA=org.apache.log4j.RollingFileAppender 39 | log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file} 40 | 41 | log4j.appender.RFA.MaxFileSize=${hadoop.log.maxfilesize} 42 | log4j.appender.RFA.MaxBackupIndex=${hadoop.log.maxbackupindex} 43 | 44 | log4j.appender.RFA.layout=org.apache.log4j.PatternLayout 45 | 46 | # Pattern format: Date LogLevel LoggerName LogMessage 47 | log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n 48 | # Debugging Pattern format 49 | #log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n 50 | 51 | 52 | # 53 | # Daily Rolling File Appender 54 | # 55 | 56 | log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender 57 | log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file} 58 | 59 | # Rollver at midnight 60 | log4j.appender.DRFA.DatePattern=.yyyy-MM-dd 61 | 62 | # 30-day backup 63 | #log4j.appender.DRFA.MaxBackupIndex=30 64 | log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout 65 | 66 | # Pattern format: Date LogLevel LoggerName LogMessage 67 | log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n 68 | # Debugging Pattern format 69 | #log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n 70 | 71 | 72 | # 73 | # console 74 | # Add "console" to rootlogger above if you want to use this 75 | # 76 | 77 | log4j.appender.console=org.apache.log4j.ConsoleAppender 78 | log4j.appender.console.target=System.err 79 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 80 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 81 | 82 | # 83 | # TaskLog Appender 84 | # 85 | 86 | #Default values 87 | hadoop.tasklog.taskid=null 88 | hadoop.tasklog.iscleanup=false 89 | hadoop.tasklog.noKeepSplits=4 90 | hadoop.tasklog.totalLogFileSize=100 91 | hadoop.tasklog.purgeLogSplits=true 92 | hadoop.tasklog.logsRetainHours=12 93 | 94 | log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender 95 | log4j.appender.TLA.taskId=${hadoop.tasklog.taskid} 96 | log4j.appender.TLA.isCleanup=${hadoop.tasklog.iscleanup} 97 | log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize} 98 | 99 | log4j.appender.TLA.layout=org.apache.log4j.PatternLayout 100 | log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n 101 | 102 | # 103 | # HDFS block state change log from block manager 104 | # 105 | # Uncomment the following to suppress normal block state change 106 | # messages from BlockManager in NameNode. 
107 | #log4j.logger.BlockStateChange=WARN 108 | 109 | # 110 | #Security appender 111 | # 112 | hadoop.security.logger=INFO,NullAppender 113 | hadoop.security.log.maxfilesize=256MB 114 | hadoop.security.log.maxbackupindex=20 115 | log4j.category.SecurityLogger=${hadoop.security.logger} 116 | hadoop.security.log.file=SecurityAuth-${user.name}.audit 117 | log4j.appender.RFAS=org.apache.log4j.RollingFileAppender 118 | log4j.appender.RFAS.File=${hadoop.log.dir}/${hadoop.security.log.file} 119 | log4j.appender.RFAS.layout=org.apache.log4j.PatternLayout 120 | log4j.appender.RFAS.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n 121 | log4j.appender.RFAS.MaxFileSize=${hadoop.security.log.maxfilesize} 122 | log4j.appender.RFAS.MaxBackupIndex=${hadoop.security.log.maxbackupindex} 123 | 124 | # 125 | # Daily Rolling Security appender 126 | # 127 | log4j.appender.DRFAS=org.apache.log4j.DailyRollingFileAppender 128 | log4j.appender.DRFAS.File=${hadoop.log.dir}/${hadoop.security.log.file} 129 | log4j.appender.DRFAS.layout=org.apache.log4j.PatternLayout 130 | log4j.appender.DRFAS.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n 131 | log4j.appender.DRFAS.DatePattern=.yyyy-MM-dd 132 | 133 | # 134 | # hadoop configuration logging 135 | # 136 | 137 | # Uncomment the following line to turn off configuration deprecation warnings. 138 | # log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=WARN 139 | 140 | # 141 | # hdfs audit logging 142 | # 143 | hdfs.audit.logger=INFO,NullAppender 144 | hdfs.audit.log.maxfilesize=256MB 145 | hdfs.audit.log.maxbackupindex=20 146 | log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=${hdfs.audit.logger} 147 | log4j.additivity.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=false 148 | log4j.appender.RFAAUDIT=org.apache.log4j.RollingFileAppender 149 | log4j.appender.RFAAUDIT.File=${hadoop.log.dir}/hdfs-audit.log 150 | log4j.appender.RFAAUDIT.layout=org.apache.log4j.PatternLayout 151 | log4j.appender.RFAAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n 152 | log4j.appender.RFAAUDIT.MaxFileSize=${hdfs.audit.log.maxfilesize} 153 | log4j.appender.RFAAUDIT.MaxBackupIndex=${hdfs.audit.log.maxbackupindex} 154 | 155 | # 156 | # mapred audit logging 157 | # 158 | mapred.audit.logger=INFO,NullAppender 159 | mapred.audit.log.maxfilesize=256MB 160 | mapred.audit.log.maxbackupindex=20 161 | log4j.logger.org.apache.hadoop.mapred.AuditLogger=${mapred.audit.logger} 162 | log4j.additivity.org.apache.hadoop.mapred.AuditLogger=false 163 | log4j.appender.MRAUDIT=org.apache.log4j.RollingFileAppender 164 | log4j.appender.MRAUDIT.File=${hadoop.log.dir}/mapred-audit.log 165 | log4j.appender.MRAUDIT.layout=org.apache.log4j.PatternLayout 166 | log4j.appender.MRAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n 167 | log4j.appender.MRAUDIT.MaxFileSize=${mapred.audit.log.maxfilesize} 168 | log4j.appender.MRAUDIT.MaxBackupIndex=${mapred.audit.log.maxbackupindex} 169 | 170 | # Custom Logging levels 171 | 172 | #log4j.logger.org.apache.hadoop.mapred.JobTracker=DEBUG 173 | #log4j.logger.org.apache.hadoop.mapred.TaskTracker=DEBUG 174 | #log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=DEBUG 175 | 176 | # Jets3t library 177 | log4j.logger.org.jets3t.service.impl.rest.httpclient.RestS3Service=ERROR 178 | 179 | # 180 | # Event Counter Appender 181 | # Sends counts of logging messages at different severity levels to Hadoop Metrics. 
182 | # 183 | log4j.appender.EventCounter=org.apache.hadoop.log.metrics.EventCounter 184 | 185 | # 186 | # Job Summary Appender 187 | # 188 | # Use following logger to send summary to separate file defined by 189 | # hadoop.mapreduce.jobsummary.log.file : 190 | # hadoop.mapreduce.jobsummary.logger=INFO,JSA 191 | # 192 | hadoop.mapreduce.jobsummary.logger=${hadoop.root.logger} 193 | hadoop.mapreduce.jobsummary.log.file=hadoop-mapreduce.jobsummary.log 194 | hadoop.mapreduce.jobsummary.log.maxfilesize=256MB 195 | hadoop.mapreduce.jobsummary.log.maxbackupindex=20 196 | log4j.appender.JSA=org.apache.log4j.RollingFileAppender 197 | log4j.appender.JSA.File=${hadoop.log.dir}/${hadoop.mapreduce.jobsummary.log.file} 198 | log4j.appender.JSA.MaxFileSize=${hadoop.mapreduce.jobsummary.log.maxfilesize} 199 | log4j.appender.JSA.MaxBackupIndex=${hadoop.mapreduce.jobsummary.log.maxbackupindex} 200 | log4j.appender.JSA.layout=org.apache.log4j.PatternLayout 201 | log4j.appender.JSA.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 202 | log4j.logger.org.apache.hadoop.mapred.JobInProgress$JobSummary=${hadoop.mapreduce.jobsummary.logger} 203 | log4j.additivity.org.apache.hadoop.mapred.JobInProgress$JobSummary=false 204 | 205 | # 206 | # Yarn ResourceManager Application Summary Log 207 | # 208 | # Set the ResourceManager summary log filename 209 | yarn.server.resourcemanager.appsummary.log.file=rm-appsummary.log 210 | # Set the ResourceManager summary log level and appender 211 | yarn.server.resourcemanager.appsummary.logger=${hadoop.root.logger} 212 | #yarn.server.resourcemanager.appsummary.logger=INFO,RMSUMMARY 213 | 214 | # To enable AppSummaryLogging for the RM, 215 | # set yarn.server.resourcemanager.appsummary.logger to 216 | # ,RMSUMMARY in hadoop-env.sh 217 | 218 | # Appender for ResourceManager Application Summary Log 219 | # Requires the following properties to be set 220 | # - hadoop.log.dir (Hadoop Log directory) 221 | # - yarn.server.resourcemanager.appsummary.log.file (resource manager app summary log filename) 222 | # - yarn.server.resourcemanager.appsummary.logger (resource manager app summary log level and appender) 223 | 224 | log4j.logger.org.apache.hadoop.yarn.server.resourcemanager.RMAppManager$ApplicationSummary=${yarn.server.resourcemanager.appsummary.logger} 225 | log4j.additivity.org.apache.hadoop.yarn.server.resourcemanager.RMAppManager$ApplicationSummary=false 226 | log4j.appender.RMSUMMARY=org.apache.log4j.RollingFileAppender 227 | log4j.appender.RMSUMMARY.File=${hadoop.log.dir}/${yarn.server.resourcemanager.appsummary.log.file} 228 | log4j.appender.RMSUMMARY.MaxFileSize=256MB 229 | log4j.appender.RMSUMMARY.MaxBackupIndex=20 230 | log4j.appender.RMSUMMARY.layout=org.apache.log4j.PatternLayout 231 | log4j.appender.RMSUMMARY.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n 232 | 233 | # HS audit log configs 234 | #mapreduce.hs.audit.logger=INFO,HSAUDIT 235 | #log4j.logger.org.apache.hadoop.mapreduce.v2.hs.HSAuditLogger=${mapreduce.hs.audit.logger} 236 | #log4j.additivity.org.apache.hadoop.mapreduce.v2.hs.HSAuditLogger=false 237 | #log4j.appender.HSAUDIT=org.apache.log4j.DailyRollingFileAppender 238 | #log4j.appender.HSAUDIT.File=${hadoop.log.dir}/hs-audit.log 239 | #log4j.appender.HSAUDIT.layout=org.apache.log4j.PatternLayout 240 | #log4j.appender.HSAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n 241 | #log4j.appender.HSAUDIT.DatePattern=.yyyy-MM-dd 242 | 243 | # Http Server Request Logs 244 | #log4j.logger.http.requests.namenode=INFO,namenoderequestlog 245 | 
#log4j.appender.namenoderequestlog=org.apache.hadoop.http.HttpRequestLogAppender 246 | #log4j.appender.namenoderequestlog.Filename=${hadoop.log.dir}/jetty-namenode-yyyy_mm_dd.log 247 | #log4j.appender.namenoderequestlog.RetainDays=3 248 | 249 | #log4j.logger.http.requests.datanode=INFO,datanoderequestlog 250 | #log4j.appender.datanoderequestlog=org.apache.hadoop.http.HttpRequestLogAppender 251 | #log4j.appender.datanoderequestlog.Filename=${hadoop.log.dir}/jetty-datanode-yyyy_mm_dd.log 252 | #log4j.appender.datanoderequestlog.RetainDays=3 253 | 254 | #log4j.logger.http.requests.resourcemanager=INFO,resourcemanagerrequestlog 255 | #log4j.appender.resourcemanagerrequestlog=org.apache.hadoop.http.HttpRequestLogAppender 256 | #log4j.appender.resourcemanagerrequestlog.Filename=${hadoop.log.dir}/jetty-resourcemanager-yyyy_mm_dd.log 257 | #log4j.appender.resourcemanagerrequestlog.RetainDays=3 258 | 259 | #log4j.logger.http.requests.jobhistory=INFO,jobhistoryrequestlog 260 | #log4j.appender.jobhistoryrequestlog=org.apache.hadoop.http.HttpRequestLogAppender 261 | #log4j.appender.jobhistoryrequestlog.Filename=${hadoop.log.dir}/jetty-jobhistory-yyyy_mm_dd.log 262 | #log4j.appender.jobhistoryrequestlog.RetainDays=3 263 | 264 | #log4j.logger.http.requests.nodemanager=INFO,nodemanagerrequestlog 265 | #log4j.appender.nodemanagerrequestlog=org.apache.hadoop.http.HttpRequestLogAppender 266 | #log4j.appender.nodemanagerrequestlog.Filename=${hadoop.log.dir}/jetty-nodemanager-yyyy_mm_dd.log 267 | #log4j.appender.nodemanagerrequestlog.RetainDays=3 268 | -------------------------------------------------------------------------------- /spark-cloud.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # pylint: disable=line-too-long 3 | # idea based on https://github.com/apache/spark/blob/master/ec2/spark_ec2.py 4 | from __future__ import absolute_import, division, print_function, unicode_literals 5 | import random 6 | 7 | import sys 8 | import time 9 | import socket 10 | from optparse import OptionParser 11 | from datetime import datetime 12 | 13 | import boto 14 | import boto.ec2.autoscale 15 | import boto.ec2.cloudwatch 16 | from boto.ec2.autoscale import LaunchConfiguration, AutoScalingGroup, ScalingPolicy 17 | from boto.ec2.cloudwatch import MetricAlarm 18 | from boto.ec2.networkinterface import NetworkInterfaceSpecification, NetworkInterfaceCollection 19 | 20 | 21 | def get_group(conn, name): 22 | """ 23 | Get the EC2 security group of the given name 24 | """ 25 | groups = conn.get_all_security_groups() 26 | return [g for g in groups if g.name == name] 27 | 28 | def get_or_make_group(conn, name, vpc_id): 29 | """ 30 | Get the EC2 security group of the given name, creating it if it doesn't exist 31 | """ 32 | groups = conn.get_all_security_groups() 33 | group = [g for g in groups if g.name == name] 34 | if len(group) > 0: 35 | return group[0] 36 | else: 37 | print("Creating security group " + name) 38 | return conn.create_security_group(name, "Spark Cloud group", vpc_id) 39 | 40 | 41 | def wait_for_tcp_port(host, port=22): 42 | sys.stdout.write( 43 | "Waiting for port {port} to be available".format(port=port) 44 | ) 45 | sys.stdout.flush() 46 | start_time = datetime.now() 47 | while True: 48 | sys.stdout.write(".") 49 | sys.stdout.flush() 50 | s = socket.socket() 51 | result = s.connect_ex((host, port)) 52 | if result == 0: 53 | s.close() 54 | break 55 | time.sleep(5) 56 | end_time = datetime.now() 57 | print("Port {port} is now available. 
Waited {t} seconds.".format( 58 | port=port, 59 | t=(end_time - start_time).seconds 60 | )) 61 | 62 | 63 | def wait_for_cluster_state(conn, cluster_instances, cluster_state="running", name="master"): 64 | """ 65 | Wait for all the instances in the cluster to reach a designated state. 66 | cluster_instances: a list of boto.ec2.instance.Instance 67 | cluster_state: a string representing the desired state of all the instances in the cluster 68 | value can be valid value from boto.ec2.instance.InstanceState such as 69 | 'running', 'terminated', etc. 70 | """ 71 | sys.stdout.write( 72 | "Waiting for {n} to enter '{s}' state.".format(n=name, s=cluster_state) 73 | ) 74 | sys.stdout.flush() 75 | start_time = datetime.now() 76 | while True: 77 | for i in cluster_instances: 78 | i.update() 79 | max_batch = 100 80 | statuses = [] 81 | for j in xrange(0, len(cluster_instances), max_batch): 82 | batch = [i.id for i in cluster_instances[j:j + max_batch]] 83 | statuses.extend(conn.get_all_instance_status(instance_ids=batch)) 84 | if all(i.state == cluster_state for i in cluster_instances): 85 | break 86 | sys.stdout.write(".") 87 | sys.stdout.flush() 88 | time.sleep(5) 89 | sys.stdout.write("\n") 90 | end_time = datetime.now() 91 | print("Cluster is now in '{s}' state. Waited {t} seconds.".format( 92 | s=cluster_state, 93 | t=(end_time - start_time).seconds 94 | )) 95 | 96 | 97 | def setup_security_groups(conn, cluster_name, opts): 98 | print("Setting up security groups...") 99 | master_group = get_or_make_group( 100 | conn, cluster_name + "-master", opts.vpc_id) 101 | worker_group = get_or_make_group( 102 | conn, cluster_name + "-workers", opts.vpc_id) 103 | authorized_address = opts.authorized_address 104 | if master_group.rules == []: # Group was just now created 105 | if opts.vpc_id is None: 106 | master_group.authorize(src_group=master_group) 107 | master_group.authorize(src_group=worker_group) 108 | else: 109 | master_group.authorize(ip_protocol='-1', from_port=None, to_port=None, 110 | src_group=master_group) 111 | master_group.authorize(ip_protocol='-1', from_port=None, to_port=None, 112 | src_group=worker_group) 113 | master_group.authorize('tcp', 0, 65535, authorized_address) 114 | if worker_group.rules == []: # Group was just now created 115 | if opts.vpc_id is None: 116 | worker_group.authorize(src_group=master_group) 117 | worker_group.authorize(src_group=worker_group) 118 | else: 119 | worker_group.authorize(ip_protocol='-1', from_port=None, to_port=None, 120 | src_group=master_group) 121 | worker_group.authorize(ip_protocol='-1', from_port=None, to_port=None, 122 | src_group=worker_group) 123 | worker_group.authorize('tcp', 0, 65535, authorized_address) 124 | return (master_group, worker_group) 125 | 126 | 127 | def delete_security_groups(conn, cluster_name): 128 | print("Deleting security groups...") 129 | master_group = get_group(conn, cluster_name + "-master") 130 | # TODO: deprecate this in Jan 2016 131 | slave_group = get_group(conn, cluster_name + "-slaves") 132 | worker_group = get_group(conn, cluster_name + "-workers") 133 | groups = master_group + worker_group + slave_group 134 | success = True 135 | for group in groups: 136 | print("Deleting rules in security group " + group.name) 137 | for rule in group.rules: 138 | for grant in rule.grants: 139 | success &= conn.revoke_security_group(group_id=group.id, ip_protocol=rule.ip_protocol, 140 | from_port=rule.from_port, to_port=rule.to_port, 141 | src_security_group_group_id=grant.group_id, cidr_ip=grant.cidr_ip) 142 | time.sleep(2) 
143 | for group in groups: 144 | try: 145 | conn.delete_security_group(group_id=group.id) 146 | print("Deleted security group %s" % group.name) 147 | except boto.exception.EC2ResponseError as e: 148 | success = False 149 | print("Failed to delete security group %s" % group.name) 150 | print(e) 151 | if not success: 152 | print("Failed to delete all security groups, try again later") 153 | 154 | 155 | def parse_options(): 156 | parser = OptionParser( 157 | usage="%prog [options] <action> <cluster_name>\n\n" 158 | + "<action> can be: launch, destroy") 159 | parser.add_option( 160 | "-k", "--key-pair", default=None, 161 | help="Key pair to use on instances") 162 | parser.add_option( 163 | "-i", "--identity-file", 164 | help="SSH private key file to use for logging into instances") 165 | parser.add_option( 166 | "-r", "--region", default="us-east-1", 167 | help="EC2 region used to launch instances in, or to find them in (default: %default)") 168 | parser.add_option( 169 | "-a", "--ami", default="ami-cdf3bea7", 170 | help="Amazon Machine Image ID to use") 171 | parser.add_option( 172 | "--authorized-address", type="string", default="0.0.0.0/0", 173 | help="Address to authorize on created security groups (default: %default)") 174 | parser.add_option( 175 | "-t", "--instance-type", default="m3.medium", 176 | help="Type of instance to launch (default: %default)") 177 | parser.add_option( 178 | "-m", "--master-instance-type", default="m3.medium", 179 | help="Master instance type (default: %default)") 180 | 181 | parser.add_option( 182 | "-u", "--scale-up-nodes-amount", type="int", default=5, 183 | help="Number of nodes to scale up by when the scale-up alarm is triggered (default %default)") 184 | parser.add_option( 185 | "-d", "--scale-down-nodes-amount", type="int", default=1, 186 | help="Number of nodes to scale down by when the scale-down alarm is triggered (default %default)") 187 | 188 | parser.add_option( 189 | "-U", "--scale-up-cooldown", type="int", default=60, 190 | help="The amount of time, in seconds, after a scale-up activity completes before any further " + 191 | "scale-up actions can occur. See the following link for more information " + 192 | "http://docs.aws.amazon.com/AutoScaling/latest/DeveloperGuide/Cooldown.html#cooldowns-scaling-specific (default %default)") 193 | parser.add_option( 194 | "-D", "--scale-down-cooldown", type="int", default=60, 195 | help="The amount of time, in seconds, after a scale-down activity completes before any further " + 196 | "scale-down actions can occur. See the following link for more information " + 197 | "http://docs.aws.amazon.com/AutoScaling/latest/DeveloperGuide/Cooldown.html#cooldowns-scaling-specific (default %default)") 198 | 199 | parser.add_option( 200 | "-n", "--min-instances", type="int", default=2, 201 | help="Minimum number of instances in the auto-scaling group (default %default)") 202 | parser.add_option( 203 | "-x", "--max-instances", type="int", default=8, 204 | help="Maximum number of instances in the auto-scaling group (default %default)") 205 | 206 | parser.add_option( 207 | "--max-spot-price", metavar="PRICE", type="float", 208 | help="If specified, launch workers as spot instances with the given " + 209 | "maximum price (in dollars). The actual price paid will be the market price " + 210 | "so potentially less than this value. 
If the market price exceeds this value " + 211 | "your nodes will shut down and no more will spin up") 212 | parser.add_option( 213 | "--subnet-id", default=None, 214 | help="VPC subnet to launch instances in") 215 | parser.add_option( 216 | "--vpc-id", default=None, 217 | help="VPC to launch instances in") 218 | parser.add_option( 219 | "-z", "--zone", default=None, 220 | help="Availability zone to launch instances in") 221 | 222 | (opts, args) = parser.parse_args() 223 | if len(args) != 2: 224 | parser.print_help() 225 | sys.exit(1) 226 | 227 | (action, cluster_name) = args 228 | return (opts, action, cluster_name) 229 | 230 | 231 | def find_instance_by_name(conn, name): 232 | reservations = conn.get_all_instances() 233 | instances = [i for r in reservations for i in r.instances] 234 | for i in instances: 235 | if "Name" in i.tags and name == i.tags['Name']: 236 | return i 237 | return None 238 | 239 | 240 | def start_master(conn, opts, cluster_name, master_group): 241 | try: 242 | conn.get_all_images(image_ids=[opts.ami])[0] 243 | except boto.exception.EC2ResponseError: 244 | print("Could not find AMI " + opts.ami) 245 | sys.exit(1) 246 | if opts.vpc_id: 247 | interface = NetworkInterfaceSpecification(subnet_id=opts.subnet_id, 248 | groups=[master_group.id], 249 | associate_public_ip_address=True) 250 | interfaces = NetworkInterfaceCollection(interface) 251 | security_group_ids = None 252 | else: 253 | interfaces = None 254 | security_group_ids = [master_group.id] 255 | master_res = conn.run_instances( 256 | image_id=opts.ami, 257 | key_name=opts.key_pair, 258 | instance_type=opts.master_instance_type, 259 | placement=opts.zone, 260 | min_count=1, 261 | max_count=1, 262 | network_interfaces=interfaces, 263 | security_group_ids=security_group_ids) 264 | instance = master_res.instances[0] 265 | time.sleep(1) 266 | conn.create_tags( 267 | [instance.id], {"Name": "{c}-master".format(c=cluster_name)}) 268 | return instance 269 | 270 | 271 | def validate_opts(conn, opts, action): 272 | if opts.zone is None and opts.vpc_id is None: 273 | opts.zone = random.choice(conn.get_all_zones()).name 274 | if opts.vpc_id is not None and opts.zone is None: 275 | print("please specify zone with vpc_id") 276 | sys.exit(1) 277 | if opts.zone is None: 278 | print("please specify zone (--zone)") 279 | sys.exit(1) 280 | if action == "launch": 281 | if opts.key_pair is None: 282 | print("please specify keypair (-k)") 283 | sys.exit(1) 284 | return opts 285 | 286 | 287 | def create_autoscaling_group(autoscale, cluster_name, master_node, opts, slave_group): 288 | lclist = autoscale.get_all_launch_configurations( 289 | names=[cluster_name + "-lc"]) 290 | if lclist: 291 | lc = lclist[0] 292 | else: 293 | lc = LaunchConfiguration( 294 | name=cluster_name + "-lc", 295 | image_id=opts.ami, 296 | key_name=opts.key_pair, 297 | security_groups=[slave_group.id], 298 | instance_type=opts.instance_type, 299 | user_data="SPARK_MASTER=" + master_node.private_dns_name + "\n", 300 | instance_monitoring=True, 301 | spot_price=opts.max_spot_price) 302 | autoscale.create_launch_configuration(lc) 303 | aglist = autoscale.get_all_groups(names=[cluster_name + "-ag"]) 304 | if aglist: 305 | ag = aglist[0] 306 | else: 307 | ag = AutoScalingGroup(group_name=cluster_name + "-ag", 308 | launch_config=lc, 309 | min_size=opts.min_instances, 310 | max_size=opts.max_instances, 311 | connection=autoscale, 312 | vpc_zone_identifier=opts.subnet_id, 313 | availability_zones=[opts.zone]) 314 | autoscale.create_auto_scaling_group(ag) 315 | as_tag = 
boto.ec2.autoscale.Tag(key='Name', 316 | value=cluster_name + '-worker', 317 | propagate_at_launch=True, 318 | resource_id=cluster_name + "-ag") 319 | autoscale.create_or_update_tags([as_tag]) 320 | 321 | 322 | def create_autoscaling_policy(autoscale, cluster_name, opts): 323 | scale_up_policy = ScalingPolicy( 324 | name='scale_up', adjustment_type='ChangeInCapacity', 325 | as_name=cluster_name + "-ag", scaling_adjustment=opts.scale_up_nodes_amount, cooldown=opts.scale_up_cooldown) 326 | scale_down_policy = ScalingPolicy( 327 | name='scale_down', adjustment_type='ChangeInCapacity', 328 | as_name=cluster_name + "-ag", scaling_adjustment=-opts.scale_down_nodes_amount, cooldown=opts.scale_down_cooldown) 329 | autoscale.create_scaling_policy(scale_up_policy) 330 | autoscale.create_scaling_policy(scale_down_policy) 331 | scale_up_policy = autoscale.get_all_policies( 332 | as_group=cluster_name + "-ag", policy_names=['scale_up'])[0] 333 | scale_down_policy = autoscale.get_all_policies( 334 | as_group=cluster_name + "-ag", policy_names=['scale_down'])[0] 335 | alarm_dimensions = {"AutoScalingGroupName": cluster_name + "-ag"} 336 | cloudwatch = boto.ec2.cloudwatch.connect_to_region(opts.region) 337 | scale_up_alarm = MetricAlarm( 338 | name='scale_up_on_cpu', namespace='AWS/EC2', 339 | metric='CPUUtilization', statistic='Average', 340 | comparison='>', threshold='50', 341 | period='60', evaluation_periods=1, 342 | alarm_actions=[scale_up_policy.policy_arn], 343 | dimensions=alarm_dimensions) 344 | cloudwatch.create_alarm(scale_up_alarm) 345 | scale_down_alarm = MetricAlarm( 346 | name='scale_down_on_cpu', namespace='AWS/EC2', 347 | metric='CPUUtilization', statistic='Average', 348 | comparison='<', threshold='40', 349 | period='60', evaluation_periods=1, 350 | alarm_actions=[scale_down_policy.policy_arn], 351 | dimensions=alarm_dimensions) 352 | cloudwatch.create_alarm(scale_down_alarm) 353 | 354 | 355 | def main(): 356 | (opts, action, cluster_name) = parse_options() 357 | conn = boto.ec2.connect_to_region(opts.region) 358 | opts = validate_opts(conn, opts, action) 359 | 360 | if action == "launch": 361 | (master_group, slave_group) = setup_security_groups(conn, cluster_name, opts) 362 | master_node = find_instance_by_name(conn, cluster_name + '-master') 363 | if not master_node: 364 | master_node = start_master(conn, opts, cluster_name, master_group) 365 | print("Master node: {m}".format(m=master_node)) 366 | wait_for_cluster_state( 367 | conn=conn, 368 | cluster_instances=([master_node]), 369 | ) 370 | autoscale = boto.ec2.autoscale.connect_to_region(opts.region) 371 | create_autoscaling_group(autoscale, cluster_name, master_node, opts, slave_group) 372 | create_autoscaling_policy(autoscale, cluster_name, opts) 373 | 374 | wait_for_tcp_port(master_node.public_dns_name) 375 | print("SSH ready:") 376 | print("ssh ubuntu@{h}".format(h=master_node.public_dns_name)) 377 | wait_for_tcp_port(master_node.public_dns_name, port=18080) 378 | print("Spark master ready:") 379 | print( 380 | "Spark WebUI: http://{h}:18080".format(h=master_node.public_dns_name)) 381 | if action == "destroy": 382 | master_node = find_instance_by_name(conn, cluster_name + '-master') 383 | if master_node: 384 | print("Terminating master...") 385 | conn.create_tags([master_node.id], {"Name": "{c}-master-terminated".format(c=cluster_name)}) 386 | master_node.terminate() 387 | print("Shutting down autoscaling group...") 388 | autoscale = boto.ec2.autoscale.connect_to_region(opts.region) 389 | aglist = 
autoscale.get_all_groups(names=[cluster_name + "-ag"]) 390 | ag = None 391 | if aglist: 392 | ag = aglist[0] 393 | ag.shutdown_instances() 394 | instances_ids = [i.instance_id for i in ag.instances] 395 | instances = conn.get_only_instances(instances_ids) 396 | else: 397 | instances = [] 398 | lclist = autoscale.get_all_launch_configurations(names=[cluster_name + "-lc"]) 399 | lc = None 400 | if lclist: 401 | lc = lclist[0] 402 | wait_for_cluster_state( 403 | conn, instances, cluster_state="terminated", name="instances") 404 | time.sleep(10) 405 | if ag: 406 | try: 407 | ag.delete() 408 | except Exception as e: 409 | print("Couldn't delete autoscaling group: %s" % e) 410 | if lc: 411 | try: 412 | lc.delete() 413 | except Exception as e: 414 | print("Couldn't delete launch configuration: %s" % e) 415 | delete_security_groups(conn, cluster_name) 416 | print("All done.") 417 | 418 | 419 | if __name__ == "__main__": 420 | main() 421 | --------------------------------------------------------------------------------
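
For quick reference, a minimal way to exercise spark-cloud.py from a shell, sketched under assumptions: the default us-east-1 region and AMI are used, an EC2 key pair named "my-key" already exists in that region, and the cluster name "demo" is arbitrary. The flags map directly onto the optparse definitions above; anything omitted falls back to the script's defaults.

    # launch a master instance plus an auto-scaling worker group for the cluster "demo"
    ./spark-cloud.py -k my-key -t m3.medium -n 2 -x 8 launch demo

    # tear it all down again: master, auto-scaling group, launch configuration, security groups
    ./spark-cloud.py destroy demo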