├── requirements.txt ├── .gitignore ├── ami-packer ├── ansible │ ├── roles │ │ ├── cloudinit │ │ │ ├── templates │ │ │ │ ├── 10_bootcmd.cfg │ │ │ │ └── configure-mdraid.sh │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── spark │ │ │ ├── templates │ │ │ │ ├── spark-alias.sh.j2 │ │ │ │ ├── spark-master.j2 │ │ │ │ ├── spark-worker.j2 │ │ │ │ └── spark-env.sh.j2 │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── openjdk │ │ │ └── tasks │ │ │ │ └── main.yaml │ │ ├── common │ │ │ └── tasks │ │ │ │ └── main.yaml │ │ └── cdh5 │ │ │ ├── templates │ │ │ └── hadoop │ │ │ │ ├── core-site.xml.j2 │ │ │ │ └── log4j.properties.j2 │ │ │ └── tasks │ │ │ └── main.yml │ └── main.yml ├── scripts │ ├── cleanup.sh │ └── setup_ansible.sh └── packer.json ├── tests ├── test-spark-app │ ├── build.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── test │ │ └── app │ │ └── AddLotsOfNumbers.scala └── test-spark-submit-works-simple-jar.sh ├── README.md ├── autoscaling-demo.md └── spark-cloud.py /requirements.txt: -------------------------------------------------------------------------------- 1 | boto>=2.20.1 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | tests/test-spark-app/project* 3 | tests/test-spark-app/.idea* 4 | tests/test-spark-app/target* 5 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/cloudinit/templates/10_bootcmd.cfg: -------------------------------------------------------------------------------- 1 | # commands to run on each boot 2 | bootcmd: 3 | - /etc/cloud/configure-mdraid.sh 4 | 5 | 6 | -------------------------------------------------------------------------------- /ami-packer/ansible/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | sudo: yes 4 | gather_facts: yes 5 | roles: 6 | - cloudinit 7 | - common 8 | - openjdk 9 | - cdh5 10 | - spark 11 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/spark/templates/spark-alias.sh.j2: -------------------------------------------------------------------------------- 1 | alias spark-shell="spark-shell --master `cat /etc/spark/conf/cluster-url`" 2 | alias spark-submit="spark-submit --master `cat /etc/spark/conf/cluster-url`" 3 | 4 | -------------------------------------------------------------------------------- /tests/test-spark-app/build.sbt: -------------------------------------------------------------------------------- 1 | val companyName = "app" 2 | 3 | val domain = "test" 4 | 5 | val projectName = "spark-cluster-launch-test" 6 | 7 | name := projectName 8 | 9 | scalaVersion := "2.10.4" 10 | 11 | val sparkVersion = "1.5.1" 12 | 13 | libraryDependencies ++= Seq( 14 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided" withSources() withJavadoc() 15 | ) 16 | 17 | organization := domain + "." 
+ companyName -------------------------------------------------------------------------------- /ami-packer/ansible/roles/openjdk/tasks/main.yaml: -------------------------------------------------------------------------------- 1 | - name: add openjdk ppa 2 | apt_repository: repo='ppa:openjdk-r/ppa' 3 | - name: add java repository key 4 | shell: apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 86F44E2A 5 | 6 | - name: install or update openjdk via apt 7 | apt: pkg={{ item }} update_cache=yes 8 | with_items: 9 | - openjdk-8-jre-headless 10 | - openjdk-8-jdk 11 | tags: 12 | - java 13 | -------------------------------------------------------------------------------- /ami-packer/scripts/cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get -y autoremove 4 | sudo apt-get -y clean 5 | 6 | echo "cleaning up dhcp leases" 7 | sudo rm /var/lib/dhcp/* 8 | 9 | echo "cleaning up udev rules" 10 | sudo rm -f /etc/udev/rules.d/70-persistent-net.rules 11 | sudo mkdir /etc/udev/rules.d/70-persistent-net.rules 12 | sudo rm -rf /dev/.udev/ 13 | sudo rm -f /lib/udev/rules.d/75-persistent-net-generator.rules 14 | 15 | 16 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/cloudinit/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: install required packages 2 | apt: pkg={{ item }} 3 | with_items: 4 | - mdadm 5 | 6 | - name: copy instance store configuration script 7 | template: src={{ item }} dest=/etc/cloud/{{ item }} owner=root group=root mode=0755 8 | with_items: 9 | - configure-mdraid.sh 10 | 11 | - name: cloudinit bootcmd 12 | template: src=10_bootcmd.cfg dest=/etc/cloud/cloud.cfg.d/10_bootcmd.cfg owner=root group=root mode=0644 13 | 14 | -------------------------------------------------------------------------------- /ami-packer/scripts/setup_ansible.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # wait for things to settle 4 | sleep 30 5 | 6 | # do not try to start services after installing 7 | cat < 22 | (1 to doSomethingNum).map(_.toString.length) 23 | .sum) 24 | .reduce(_ + _) 25 | 26 | val pw = new java.io.PrintWriter(new File(fullPathToFile)) 27 | try pw.write(count.toString + "\n") finally pw.close() 28 | 29 | } 30 | } -------------------------------------------------------------------------------- /ami-packer/ansible/roles/cloudinit/templates/configure-mdraid.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ### Setup instance stores with raid0 3 | 4 | NUM_DEVICES=`find /dev -name 'xvd[b-z]*' | wc -l` 5 | DEVICES=`find /dev -name 'xvd[b-z]*' -printf '%p\040'` 6 | 7 | mount -l | grep '/dev/md127' 8 | 9 | if [ $? 
-eq 1 ]; then 10 | echo "Mounting /dev/md127" 11 | 12 | for DEVICE in $DEVICES; do 13 | umount $DEVICE 14 | done 15 | 16 | 17 | yes | mdadm --create /dev/md127 --name=0 --level=0 -c256 --raid-devices=${NUM_DEVICES} --force $DEVICES 18 | echo "DEVICE $DEVICES" > /etc/mdadm.conf 19 | mdadm --detail --scan >> /etc/mdadm.conf 20 | 21 | blockdev --setra 65536 /dev/md127 22 | mkfs.ext4 /dev/md127 23 | mount -t ext4 -o noatime /dev/md127 /mnt 24 | mkdir /mnt/tmp 25 | chmod -R 777 /mnt 26 | chmod 1777 /mnt/tmp 27 | mount -o bind /mnt/tmp /tmp 28 | else 29 | echo "/dev/md127 already configured" 30 | fi 31 | 32 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/cdh5/templates/hadoop/core-site.xml.j2: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | 3 | 4 | <configuration> 5 | 6 | <property> 7 | <name>fs.trash.interval</name> 8 | <value>1440</value> 9 | </property> 10 | 11 | 12 | <property> 13 | <name>io.compression.codecs</name> 14 | <value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec, 15 | org.apache.hadoop.io.compress.BZip2Codec,com.hadoop.compression.lzo.LzoCodec, 16 | com.hadoop.compression.lzo.LzopCodec,org.apache.hadoop.io.compress.SnappyCodec</value> 17 | </property> 18 | <property> 19 | <name>io.compression.codec.lzo.class</name> 20 | <value>com.hadoop.compression.lzo.LzoCodec</value> 21 | </property> 22 | 23 | 24 | <property> 25 | <name>hadoop.proxyuser.oozie.hosts</name> 26 | <value>*</value> 27 | </property> 28 | <property> 29 | <name>hadoop.proxyuser.oozie.groups</name> 30 | <value>*</value> 31 | </property> 32 | 33 | 34 | <property> 35 | <name>hadoop.proxyuser.httpfs.hosts</name> 36 | <value>*</value> 37 | </property> 38 | <property> 39 | <name>hadoop.proxyuser.httpfs.groups</name> 40 | <value>*</value> 41 | </property> 42 | 43 | <property> 44 | <name>fs.s3n.multipart.uploads.enabled</name> 45 | <value>true</value> 46 | </property> 47 | 48 | </configuration> 49 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/cdh5/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: add Cloudera CDH5 key 2 | apt_key: url=http://archive.cloudera.com/cdh5/ubuntu/trusty/amd64/cdh/archive.key 3 | tags: 4 | - cdh5 5 | 6 | - name: add Cloudera CDH5 repository 7 | apt_repository: repo='deb [arch=amd64] http://archive.cloudera.com/cdh5/ubuntu/trusty/amd64/cdh trusty-cdh5 contrib' 8 | tags: 9 | - cdh5 10 | 11 | - name: add Cloudera CDH5 GPL extras repository 12 | apt_repository: repo='deb [arch=amd64] http://archive.cloudera.com/gplextras5/ubuntu/trusty/amd64/gplextras trusty-gplextras5 contrib' 13 | tags: 14 | - cdh5 15 | 16 | - name: install hadoop base pkgs 17 | apt: name={{ item }} state=present 18 | with_items: 19 | - hadoop 20 | - hadoop-hdfs 21 | - hadoop-client 22 | - hadoop-lzo 23 | tags: 24 | - cdh5 25 | 26 | - name: create /etc/hadoop/conf.cluster 27 | file: path=/etc/hadoop/conf.cluster state=directory owner=root group=root mode=0755 28 | register: create_hadoop_conf 29 | tags: 30 | - cdh5 31 | 32 | - name: create alternatives for hadoop-conf 33 | shell: update-alternatives --install /etc/hadoop/conf hadoop-conf {{ item }} 50 34 | with_items: 35 | - /etc/hadoop/conf.cluster 36 | when: create_hadoop_conf|changed 37 | tags: 38 | - cdh5 39 | 40 | - name: set alternatives for hadoop-conf 41 | shell: update-alternatives --set hadoop-conf {{ item }} 42 | with_items: 43 | - /etc/hadoop/conf.cluster 44 | when: create_hadoop_conf|changed 45 | tags: 46 | - cdh5 47 | 48 | - name: copy the hadoop configuration files 49 | template: src=hadoop/{{ item }}.j2 dest=/etc/hadoop/conf.cluster/{{ item }} owner=hdfs group=hadoop mode=0664 50 | with_items: 51 | - core-site.xml 52 | - log4j.properties 53 | register: copy_hadoop_conf 54 | tags: 55 | - cdh5 56 |
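The two `update-alternatives` tasks above register `/etc/hadoop/conf.cluster` and make it the active Hadoop configuration. As a quick sanity check while baking the AMI, a brief sketch (not part of the role; it only uses standard `update-alternatives` and `readlink`):

```
# confirm that the cluster configuration directory is the active hadoop-conf alternative
update-alternatives --display hadoop-conf
readlink -f /etc/hadoop/conf   # expected to resolve to /etc/hadoop/conf.cluster
```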
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spark-cloud 2 | Spark-cloud is a set of scripts for starting Spark clusters on EC2. 3 | 4 | [![Join the chat at https://gitter.im/entropyltd/spark-cloud](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/entropyltd/spark-cloud?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 5 | 6 | [![Code Health](https://landscape.io/github/entropyltd/spark-cloud/master/landscape.svg?style=flat)](https://landscape.io/github/entropyltd/spark-cloud/master) 7 | 8 | # Warning 9 | spark-cloud is alpha-quality, pre-release software; use it at your own risk. 10 | Always check that your clusters have started/stopped properly. 11 | 12 | spark-cloud currently only works in the us-east-1 AWS region; support for other regions is coming very soon! 13 | 14 | # Cluster Security 15 | Spark-cloud relies on IP-level security for access to the web UIs, so you should specify `--authorized-address=your.ip.address/32` when running a real cluster. 16 | 17 | At the moment the auto-scaling group starts with 2 slaves; the minimum is 2 slaves and the maximum is 8. 18 | 19 | # Example usage 20 | 21 | ## To launch a cluster into VPC 22 | 23 | ``` 24 | # set credentials 25 | export AWS_ACCESS_KEY=.. 26 | export AWS_SECRET_ACCESS_KEY=... 27 | # start cluster 28 | ./spark-cloud.py -k keypair --vpc-id=vpc-XXXXX --subnet-id=subnet-XXXXXX --zone=us-east-1a launch sparkcluster1 29 | ``` 30 | 31 | ## To launch a cluster into EC2-classic 32 | ``` 33 | # set credentials 34 | export AWS_ACCESS_KEY=.. 35 | export AWS_SECRET_ACCESS_KEY=... 36 | # start cluster 37 | ./spark-cloud.py -k keypair --zone=us-east-1e launch spark-ec2classic 38 | ``` 39 | 40 | ## To ssh into your cluster and run the spark shell 41 | 42 | To ssh in: 43 | 44 | ``` 45 | ssh -i path-to-keypair.pem ubuntu@master-host-which-is-helpfully-printed-at-launch 46 | ``` 47 | 48 | To run `spark-shell` you can't use `--master yarn-client`. 49 | The master URL is of the form `spark://host:port` and can be found by opening the Spark UI (whose address is helpfully printed at launch time). 50 | 51 | 52 | To run spark-shell, just: 53 | ``` 54 | spark-shell 55 | ``` 56 | 57 | # Termination 58 | 59 | Termination has a couple of known issues; if it does not work, just rerun the `destroy` command.
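A minimal, hedged example of tearing a cluster down (the cluster name and credentials are assumed to match the VPC launch example above):

```
# set credentials as before
export AWS_ACCESS_KEY=..
export AWS_SECRET_ACCESS_KEY=...
# tear down the cluster; rerunning is safe if the first attempt fails part-way
./spark-cloud.py destroy sparkcluster1
```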
60 | -------------------------------------------------------------------------------- /tests/test-spark-submit-works-simple-jar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | key_pair_path=$1 6 | key_pair=$2 7 | 8 | cluster_name=spark-ec2classic-test 9 | jar_name=test-spark-app.jar 10 | local_jar_path=test-spark-app/target/scala-2.10/spark-cluster-launch-test_2.10-0.1-SNAPSHOT.jar 11 | test_log_path=/tmp/spark-cloud-test.log 12 | 13 | user=ubuntu 14 | home_dir=/home/${user} 15 | 16 | job_result=job-result.txt 17 | 18 | working_directory=`pwd` 19 | 20 | correct_value="-1696934592" 21 | 22 | ssh_args="-o StrictHostKeyChecking=no -i ${key_pair_path}" 23 | 24 | script_path="../spark-cloud.py" 25 | 26 | if [ "${key_pair_path}" = "" ]; then 27 | echo "ERROR: please supply the key pair path as the first arg to this script" 28 | exit 1 29 | fi 30 | 31 | if [ "${key_pair}" = "" ]; then 32 | echo "ERROR: please supply the key pair as the second arg to this script" 33 | exit 1 34 | fi 35 | 36 | # TODO take a look at s3 config or aws config files to set automatically 37 | 38 | if [ "${AWS_SECRET_ACCESS_KEY}" = "" ]; then 39 | echo "ERROR: AWS_SECRET_ACCESS_KEY not set" 40 | exit 1 41 | fi 42 | 43 | if [ "${AWS_ACCESS_KEY}" = "" ]; then 44 | echo "ERROR: AWS_ACCESS_KEY not set" 45 | exit 1 46 | fi 47 | 48 | function extract-master-node-from-log { 49 | master=`cat ${test_log_path} | grep -o "ec2.*compute[\-]*[0-9]*\.amazonaws\.com" | head -1` 50 | if [ "${master}" = "" ]; then 51 | echo "ERROR: Did not find master node" 52 | exit 1 53 | fi 54 | echo ${master} 55 | } 56 | 57 | function create-cluster { 58 | ${script_path} -k ${key_pair} --zone=us-east-1e --max-spot-price=0.02 --min-instances=1 --max-instances=3 launch ${cluster_name} | tee ${test_log_path} 59 | } 60 | 61 | function build-simple-spark-app { 62 | cd ${working_directory}/test-spark-app 63 | sbt package 64 | cd ${working_directory} 65 | } 66 | 67 | function spark-submit-simple-app { 68 | echo "INFO: Copying jar" 69 | scp ${ssh_args} ${local_jar_path} ${user}@$1:${home_dir}/${jar_name} 70 | 71 | echo "INFO: Getting spark master URL" 72 | host=`ssh ${ssh_args} ${user}@$1 "hostname"` 73 | spark_master=spark://${host}.ec2.internal:7077 74 | 75 | echo "INFO: Spark master url: $spark_master" 76 | 77 | echo "INFO: Running spark-submit" 78 | ssh ${ssh_args} ${user}@$1 "spark-submit --master $spark_master --class test.app.AddLotsOfNumbers ${jar_name} ${home_dir}/${job_result}" 79 | } 80 | 81 | # TODO Add more tests, like: 82 | # to force it to scale 83 | # curl the UI, etc 84 | function check-output-of-job { 85 | scp ${ssh_args} ${user}@$1:${home_dir}/${job_result} /tmp/ 86 | value=`cat /tmp/${job_result}` 87 | 88 | if [ "${value}" = "${correct_value}" ]; then 89 | echo "INFO: Test passed!!!" 
90 | else 91 | echo "ERROR: Test failed, expected ${correct_value} but got ${value}" 92 | exit 1 93 | fi 94 | } 95 | 96 | function destroy-cluster { 97 | echo "INFO: Destroying cluster" 98 | ${script_path} destroy spark-ec2classic-test 99 | } 100 | 101 | trap destroy-cluster EXIT 102 | 103 | create-cluster 104 | master=`extract-master-node-from-log` 105 | 106 | echo "INFO: master: $master" 107 | 108 | build-simple-spark-app 109 | 110 | spark-submit-simple-app ${master} 111 | 112 | check-output-of-job ${master} 113 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/spark/templates/spark-master.j2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # Starts a Spark master 19 | # 20 | # chkconfig: 2345 86 14 21 | # description: Spark master 22 | # 23 | ### BEGIN INIT INFO 24 | # Provides: spark-master 25 | # Short-Description: Spark master 26 | # Default-Start: 2 3 4 5 27 | # Default-Stop: 0 1 6 28 | # Required-Start: $syslog $remote_fs 29 | # Required-Stop: $syslog $remote_fs 30 | # Should-Start: 31 | # Should-Stop: 32 | ### END INIT INFO 33 | 34 | CONF_DIR="/etc/spark/conf" 35 | if [ -f $CONF_DIR/spark-env.sh ]; then 36 | . $CONF_DIR/spark-env.sh 37 | fi 38 | echo "spark://${STANDALONE_SPARK_MASTER_HOST}:7077" > /etc/spark/conf/cluster-url 39 | if [ ! -z "$SPARK_MASTER" ]; then 40 | echo "Not a master, not starting" 41 | exit 0 42 | fi 43 | 44 | . /lib/lsb/init-functions 45 | BIGTOP_DEFAULTS_DIR=${BIGTOP_DEFAULTS_DIR-/etc/default} 46 | [ -n "${BIGTOP_DEFAULTS_DIR}" -a -r ${BIGTOP_DEFAULTS_DIR}/hadoop ] && . ${BIGTOP_DEFAULTS_DIR}/hadoop 47 | [ -n "${BIGTOP_DEFAULTS_DIR}" -a -r ${BIGTOP_DEFAULTS_DIR}/spark-master ] && . ${BIGTOP_DEFAULTS_DIR}/spark-master 48 | 49 | # Autodetect JAVA_HOME if not defined 50 | . 
/usr/lib/bigtop-utils/bigtop-detect-javahome 51 | 52 | RETVAL_SUCCESS=0 53 | 54 | STATUS_RUNNING=0 55 | STATUS_DEAD=1 56 | STATUS_DEAD_AND_LOCK=2 57 | STATUS_NOT_RUNNING=3 58 | STATUS_OTHER_ERROR=102 59 | 60 | 61 | ERROR_PROGRAM_NOT_INSTALLED=5 62 | ERROR_PROGRAM_NOT_CONFIGURED=6 63 | 64 | 65 | RETVAL=0 66 | SLEEP_TIME=5 67 | PROC_NAME="java" 68 | 69 | DAEMON="spark-master" 70 | DESC="Spark master" 71 | EXEC_PATH="/usr/lib/spark/bin/spark-class" 72 | EXEC_DIR="" 73 | SVC_USER="spark" 74 | DAEMON_FLAGS="" 75 | CONF_DIR="/etc/spark/conf" 76 | PIDFILE="/var/run/spark/spark-master.pid" 77 | LOCKDIR="/var/lock/subsys" 78 | LOCKFILE="$LOCKDIR/spark-master" 79 | WORKING_DIR="/var/lib/spark" 80 | 81 | install -d -m 0755 -o spark -g spark /var/run/spark 1>/dev/null 2>&1 || : 82 | [ -d "$LOCKDIR" ] || install -d -m 0755 $LOCKDIR 1>/dev/null 2>&1 || : 83 | start() { 84 | [ -x $EXE_FILE ] || exit $ERROR_PROGRAM_NOT_INSTALLED 85 | log_success_msg "Starting $DESC (${DAEMON}): " 86 | 87 | checkstatusofproc 88 | status=$? 89 | if [ "$status" -eq "$STATUS_RUNNING" ]; then 90 | log_success_msg "${DESC} is running" 91 | exit 0 92 | fi 93 | 94 | LOG_FILE=/var/log/spark/${DAEMON}.out 95 | 96 | su -s /bin/bash $SVC_USER -c "nohup nice -n 0 \ 97 | ${EXEC_PATH} org.apache.spark.deploy.master.Master $DAEMON_FLAGS \ 98 | > $LOG_FILE 2>&1 & "'echo $!' > "$PIDFILE" 99 | 100 | sleep 3 101 | 102 | checkstatusofproc 103 | RETVAL=$? 104 | [ $RETVAL -eq $STATUS_RUNNING ] && touch $LOCKFILE 105 | return $RETVAL 106 | } 107 | stop() { 108 | log_success_msg "Stopping $DESC (${DAEMON}): " 109 | killproc -p $PIDFILE java 110 | RETVAL=$? 111 | 112 | [ $RETVAL -eq $RETVAL_SUCCESS ] && rm -f $LOCKFILE $PIDFILE 113 | return $RETVAL 114 | } 115 | restart() { 116 | stop 117 | start 118 | } 119 | 120 | checkstatusofproc(){ 121 | pidofproc -p $PIDFILE $PROC_NAME > /dev/null 122 | } 123 | 124 | checkstatus(){ 125 | checkstatusofproc 126 | status=$? 127 | 128 | case "$status" in 129 | $STATUS_RUNNING) 130 | log_success_msg "${DESC} is running" 131 | ;; 132 | $STATUS_DEAD) 133 | log_failure_msg "${DESC} is dead and pid file exists" 134 | ;; 135 | $STATUS_DEAD_AND_LOCK) 136 | log_failure_msg "${DESC} is dead and lock file exists" 137 | ;; 138 | $STATUS_NOT_RUNNING) 139 | log_failure_msg "${DESC} is not running" 140 | ;; 141 | *) 142 | log_failure_msg "${DESC} status is unknown" 143 | ;; 144 | esac 145 | return $status 146 | } 147 | 148 | condrestart(){ 149 | [ -e $LOCKFILE ] && restart || : 150 | } 151 | 152 | check_for_root() { 153 | if [ $(id -ur) -ne 0 ]; then 154 | echo 'Error: root user required' 155 | echo 156 | exit 1 157 | fi 158 | } 159 | 160 | service() { 161 | case "$1" in 162 | start) 163 | check_for_root 164 | start 165 | ;; 166 | stop) 167 | check_for_root 168 | stop 169 | ;; 170 | status) 171 | checkstatus 172 | RETVAL=$? 173 | ;; 174 | restart) 175 | check_for_root 176 | restart 177 | ;; 178 | condrestart|try-restart) 179 | check_for_root 180 | condrestart 181 | ;; 182 | *) 183 | echo $"Usage: $0 {start|stop|status|restart|try-restart|condrestart}" 184 | exit 1 185 | esac 186 | } 187 | 188 | service "$@" 189 | 190 | exit $RETVAL 191 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/spark/templates/spark-worker.j2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. 
See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # Starts a Spark worker 19 | # 20 | # chkconfig: 2345 87 13 21 | # description: Spark worker 22 | # 23 | ### BEGIN INIT INFO 24 | # Provides: spark-worker 25 | # Short-Description: Spark worker 26 | # Default-Start: 2 3 4 5 27 | # Default-Stop: 0 1 6 28 | # Required-Start: $syslog $remote_fs 29 | # Required-Stop: $syslog $remote_fs 30 | # Should-Start: 31 | # Should-Stop: 32 | ### END INIT INFO 33 | 34 | CONF_DIR="/etc/spark/conf" 35 | if [ -f $CONF_DIR/spark-env.sh ]; then 36 | . $CONF_DIR/spark-env.sh 37 | fi 38 | echo "spark://${STANDALONE_SPARK_MASTER_HOST}:7077" > /etc/spark/conf/cluster-url 39 | if [ -z "$SPARK_MASTER" ]; then 40 | echo "I'm a master, worker not starting" 41 | exit 0 42 | fi 43 | 44 | . /lib/lsb/init-functions 45 | BIGTOP_DEFAULTS_DIR=${BIGTOP_DEFAULTS_DIR-/etc/default} 46 | [ -n "${BIGTOP_DEFAULTS_DIR}" -a -r ${BIGTOP_DEFAULTS_DIR}/hadoop ] && . ${BIGTOP_DEFAULTS_DIR}/hadoop 47 | [ -n "${BIGTOP_DEFAULTS_DIR}" -a -r ${BIGTOP_DEFAULTS_DIR}/spark-worker ] && . ${BIGTOP_DEFAULTS_DIR}/spark-worker 48 | 49 | # Autodetect JAVA_HOME if not defined 50 | . /usr/lib/bigtop-utils/bigtop-detect-javahome 51 | 52 | RETVAL_SUCCESS=0 53 | 54 | STATUS_RUNNING=0 55 | STATUS_DEAD=1 56 | STATUS_DEAD_AND_LOCK=2 57 | STATUS_NOT_RUNNING=3 58 | STATUS_OTHER_ERROR=102 59 | 60 | 61 | ERROR_PROGRAM_NOT_INSTALLED=5 62 | ERROR_PROGRAM_NOT_CONFIGURED=6 63 | 64 | 65 | RETVAL=0 66 | SLEEP_TIME=5 67 | PROC_NAME="java" 68 | 69 | DAEMON="spark-worker" 70 | DESC="Spark worker" 71 | EXEC_PATH="/usr/lib/spark/bin/spark-class" 72 | EXEC_DIR="" 73 | SVC_USER="spark" 74 | DAEMON_FLAGS="" 75 | CONF_DIR="/etc/spark/conf" 76 | PIDFILE="/var/run/spark/spark-worker.pid" 77 | LOCKDIR="/var/lock/subsys" 78 | LOCKFILE="$LOCKDIR/spark-worker" 79 | WORKING_DIR="/var/lib/spark" 80 | 81 | install -d -m 0755 -o spark -g spark /var/run/spark 1>/dev/null 2>&1 || : 82 | [ -d "$LOCKDIR" ] || install -d -m 0755 $LOCKDIR 1>/dev/null 2>&1 || : 83 | start() { 84 | [ -x $EXE_FILE ] || exit $ERROR_PROGRAM_NOT_INSTALLED 85 | log_success_msg "Starting $DESC (${DAEMON}): " 86 | 87 | checkstatusofproc 88 | status=$? 89 | if [ "$status" -eq "$STATUS_RUNNING" ]; then 90 | log_success_msg "${DESC} is running" 91 | exit 0 92 | fi 93 | 94 | LOG_FILE=/var/log/spark/${DAEMON}.out 95 | 96 | if [ -f $CONF_DIR/spark-env.sh ]; then 97 | . $CONF_DIR/spark-env.sh 98 | fi 99 | 100 | su -s /bin/bash $SVC_USER -c "nohup nice -n 0 \ 101 | ${EXEC_PATH} org.apache.spark.deploy.worker.Worker spark://$STANDALONE_SPARK_MASTER_HOST:$SPARK_MASTER_PORT $DAEMON_FLAGS \ 102 | > $LOG_FILE 2>&1 & "'echo $!' > "$PIDFILE" 103 | 104 | sleep 3 105 | 106 | checkstatusofproc 107 | RETVAL=$? 
108 | [ $RETVAL -eq $STATUS_RUNNING ] && touch $LOCKFILE 109 | return $RETVAL 110 | } 111 | stop() { 112 | log_success_msg "Stopping $DESC (${DAEMON}): " 113 | killproc -p $PIDFILE java 114 | RETVAL=$? 115 | 116 | [ $RETVAL -eq $RETVAL_SUCCESS ] && rm -f $LOCKFILE $PIDFILE 117 | return $RETVAL 118 | } 119 | restart() { 120 | stop 121 | start 122 | } 123 | 124 | checkstatusofproc(){ 125 | pidofproc -p $PIDFILE $PROC_NAME > /dev/null 126 | } 127 | 128 | checkstatus(){ 129 | checkstatusofproc 130 | status=$? 131 | 132 | case "$status" in 133 | $STATUS_RUNNING) 134 | log_success_msg "${DESC} is running" 135 | ;; 136 | $STATUS_DEAD) 137 | log_failure_msg "${DESC} is dead and pid file exists" 138 | ;; 139 | $STATUS_DEAD_AND_LOCK) 140 | log_failure_msg "${DESC} is dead and lock file exists" 141 | ;; 142 | $STATUS_NOT_RUNNING) 143 | log_failure_msg "${DESC} is not running" 144 | ;; 145 | *) 146 | log_failure_msg "${DESC} status is unknown" 147 | ;; 148 | esac 149 | return $status 150 | } 151 | 152 | condrestart(){ 153 | [ -e $LOCKFILE ] && restart || : 154 | } 155 | 156 | check_for_root() { 157 | if [ $(id -ur) -ne 0 ]; then 158 | echo 'Error: root user required' 159 | echo 160 | exit 1 161 | fi 162 | } 163 | 164 | service() { 165 | case "$1" in 166 | start) 167 | check_for_root 168 | start 169 | ;; 170 | stop) 171 | check_for_root 172 | stop 173 | ;; 174 | status) 175 | checkstatus 176 | RETVAL=$? 177 | ;; 178 | restart) 179 | check_for_root 180 | restart 181 | ;; 182 | condrestart|try-restart) 183 | check_for_root 184 | condrestart 185 | ;; 186 | *) 187 | echo $"Usage: $0 {start|stop|status|restart|try-restart|condrestart}" 188 | exit 1 189 | esac 190 | } 191 | 192 | service "$@" 193 | 194 | exit $RETVAL 195 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/spark/templates/spark-env.sh.j2: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This file is sourced when running various Spark programs. 4 | # Copy it as spark-env.sh and edit that to configure Spark for your site. 5 | 6 | # Options read when launching programs locally with 7 | # ./bin/run-example or ./bin/spark-submit 8 | # - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files 9 | # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node 10 | # - SPARK_PUBLIC_DNS, to set the public dns name of the driver program 11 | # - SPARK_CLASSPATH, default classpath entries to append 12 | 13 | # Options read by executors and drivers running inside the cluster 14 | # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node 15 | # - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program 16 | # - SPARK_CLASSPATH, default classpath entries to append 17 | # - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data 18 | # - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos 19 | 20 | # Options read in YARN client mode 21 | # - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files 22 | # - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2) 23 | # - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1). 24 | # - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G) 25 | # - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 
1000M, 2G) (Default: 1G) 26 | # - SPARK_YARN_APP_NAME, The name of your application (Default: Spark) 27 | # - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: ‘default’) 28 | # - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job. 29 | # - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job. 30 | 31 | # Options for the daemons used in the standalone deploy mode 32 | # - SPARK_MASTER_IP, to bind the master to a different IP address or hostname 33 | # - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master 34 | # - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") 35 | # - SPARK_WORKER_CORES, to set the number of cores to use on this machine 36 | # - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g) 37 | # - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker 38 | # - SPARK_WORKER_INSTANCES, to set the number of worker processes per node 39 | # - SPARK_WORKER_DIR, to set the working directory of worker processes 40 | # - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") 41 | # - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). 42 | # - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") 43 | # - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") 44 | # - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") 45 | # - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers 46 | 47 | # Generic options for the daemons used in the standalone deploy mode 48 | # - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) 49 | # - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) 50 | # - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) 51 | # - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) 52 | # - SPARK_NICENESS The scheduling priority for daemons. (Default: 0) 53 | 54 | ### 55 | ### === IMPORTANT === 56 | ### Change the following to specify a real cluster's Master host 57 | ### 58 | TMP=`tempfile` 59 | curl -f -s http://169.254.169.254/latest/user-data > $TMP || true 60 | chmod a+rx $TMP 61 | . 
$TMP 62 | rm -f $TMP 63 | PUBLIC_HOSTNAME=`curl -s http://169.254.169.254/latest/meta-data/hostname` 64 | 65 | if [ -z "$SPARK_MASTER" ] 66 | then 67 | export STANDALONE_SPARK_MASTER_HOST=$PUBLIC_HOSTNAME 68 | else 69 | export STANDALONE_SPARK_MASTER_HOST=$SPARK_MASTER 70 | fi 71 | 72 | export SPARK_MASTER_IP=$STANDALONE_SPARK_MASTER_HOST 73 | export SPARK_MASTER_HOST=$STANDALONE_SPARK_MASTER_HOST 74 | export SPARK_LOCAL_IP=$PUBLIC_HOSTNAME 75 | export SPARK_PUBLIC_DNS=$PUBLIC_HOSTNAME 76 | 77 | ### Let's run everything with JVM runtime, instead of Scala 78 | export SPARK_LAUNCH_WITH_SCALA=0 79 | export SPARK_LIBRARY_PATH=${SPARK_HOME}/lib 80 | export SPARK_MASTER_WEBUI_PORT=18080 81 | export SPARK_MASTER_PORT=7077 82 | export SPARK_WORKER_PORT=7078 83 | export SPARK_WORKER_WEBUI_PORT=18081 84 | export SPARK_WORKER_DIR=/var/run/spark/work 85 | export SPARK_LOG_DIR=/var/log/spark 86 | export SPARK_PID_DIR='/var/run/spark/' 87 | 88 | if [ -n "$HADOOP_HOME" ]; then 89 | export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/libfakeroot:/usr/lib64/libfakeroot:/usr/lib32/libfakeroot:/usr/lib/hadoop/lib/native 90 | fi 91 | 92 | if [ -d "/usr/lib/hadoop/lib/native" ]; then 93 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib/hadoop/lib/native 94 | fi 95 | 96 | export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop/conf} 97 | 98 | if [[ -d $SPARK_HOME/python ]] 99 | then 100 | for i in 101 | do 102 | SPARK_DIST_CLASSPATH=${SPARK_DIST_CLASSPATH}:$i 103 | done 104 | fi 105 | 106 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:$SPARK_LIBRARY_PATH/spark-assembly.jar" 107 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:" 108 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop/lib/*" 109 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop/*" 110 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop-hdfs/lib/*" 111 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop-hdfs/*" 112 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop-mapreduce/lib/*" 113 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop-mapreduce/*" 114 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop-yarn/lib/*" 115 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hadoop-yarn/*" 116 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/hive/lib/*" 117 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/flume-ng/lib/*" 118 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/paquet/lib/*" 119 | SPARK_DIST_CLASSPATH="$SPARK_DIST_CLASSPATH:/usr/lib/avro/lib/*" 120 | -------------------------------------------------------------------------------- /autoscaling-demo.md: -------------------------------------------------------------------------------- 1 | 2 | # Autoscaling Demo 3 | 4 | (This demo assumes default puny instance size) 5 | 6 | Once in the shell try running some dummy code to consume the resources, e.g. 7 | 8 | ``` 9 | sc.makeRDD(1 to 1000000).repartition(400).map(_ => (1 to 20000).map(_.toString.length).reduce(_ + _)).reduce(_ + _) 10 | ``` 11 | 12 | You should observe the rate of log entries is around 2 - 5 seconds, e.g. 
13 | 14 | ``` 15 | 15/12/03 14:01:55 INFO scheduler.TaskSetManager: Starting task 15.0 in stage 1.0 (TID 18, 172.31.60.214, partition 15,PROCESS_LOCAL, 2170 bytes) 16 | 15/12/03 14:01:55 INFO scheduler.TaskSetManager: Finished task 12.0 in stage 1.0 (TID 15) in 5668 ms on 172.31.60.214 (13/400) 17 | 15/12/03 14:01:55 INFO scheduler.TaskSetManager: Starting task 16.0 in stage 1.0 (TID 19, 172.31.51.194, partition 16,PROCESS_LOCAL, 2170 bytes) 18 | 15/12/03 14:01:55 INFO scheduler.TaskSetManager: Finished task 13.0 in stage 1.0 (TID 16) in 5627 ms on 172.31.51.194 (14/400) 19 | 15/12/03 14:01:56 INFO scheduler.TaskSetManager: Starting task 17.0 in stage 1.0 (TID 20, 172.31.54.237, partition 17,PROCESS_LOCAL, 2170 bytes) 20 | 15/12/03 14:01:56 INFO scheduler.TaskSetManager: Finished task 14.0 in stage 1.0 (TID 17) in 5621 ms on 172.31.54.237 (15/400) 21 | 15/12/03 14:02:01 INFO scheduler.TaskSetManager: Starting task 18.0 in stage 1.0 (TID 21, 172.31.60.214, partition 18,PROCESS_LOCAL, 2170 bytes) 22 | 15/12/03 14:02:01 INFO scheduler.TaskSetManager: Finished task 15.0 in stage 1.0 (TID 18) in 5665 ms on 172.31.60.214 (16/400) 23 | 15/12/03 14:02:01 INFO scheduler.TaskSetManager: Starting task 19.0 in stage 1.0 (TID 22, 172.31.51.194, partition 19,PROCESS_LOCAL, 2170 bytes) 24 | 15/12/03 14:02:01 INFO scheduler.TaskSetManager: Finished task 16.0 in stage 1.0 (TID 19) in 5621 ms on 172.31.51.194 (17/400) 25 | ``` 26 | 27 | After around 50 tasks you should see some nodes spinning up in the ec2 dashboard (https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#Instances:securityGroupName=spark-ec2classic-slaves;sort=instanceId) 28 | 29 | After around 120 tasks you should see some messages about adding executors: 30 | 31 | ``` 32 | 15/12/03 14:05:17 INFO client.AppClient$ClientEndpoint: Executor added: app-20151203140010-0003/3 on worker-20151203140515-172.31.61.144-7078 (172.31.61.144:7078) with 1 cores 33 | 15/12/03 14:05:17 INFO cluster.SparkDeploySchedulerBackend: Granted executor ID app-20151203140010-0003/3 on hostPort 172.31.61.144:7078 with 1 cores, 1024.0 MB RAM 34 | 15/12/03 14:05:17 INFO client.AppClient$ClientEndpoint: Executor updated: app-20151203140010-0003/3 is now RUNNING 35 | 15/12/03 14:05:18 INFO scheduler.TaskSetManager: Starting task 123.0 in stage 1.0 (TID 126, 172.31.51.194, partition 123,PROCESS_LOCAL, 2170 bytes) 36 | 15/12/03 14:05:18 INFO scheduler.TaskSetManager: Finished task 120.0 in stage 1.0 (TID 123) in 5628 ms on 172.31.51.194 (121/400) 37 | 15/12/03 14:05:18 INFO scheduler.TaskSetManager: Starting task 124.0 in stage 1.0 (TID 127, 172.31.54.237, partition 124,PROCESS_LOCAL, 2170 bytes) 38 | 15/12/03 14:05:18 INFO scheduler.TaskSetManager: Finished task 121.0 in stage 1.0 (TID 124) in 5567 ms on 172.31.54.237 (122/400) 39 | 15/12/03 14:05:18 INFO client.AppClient$ClientEndpoint: Executor updated: app-20151203140010-0003/3 is now LOADING 40 | 15/12/03 14:05:22 INFO client.AppClient$ClientEndpoint: Executor added: app-20151203140010-0003/4 on worker-20151203140520-172.31.61.145-7078 (172.31.61.145:7078) with 1 cores 41 | 15/12/03 14:05:22 INFO cluster.SparkDeploySchedulerBackend: Granted executor ID app-20151203140010-0003/4 on hostPort 172.31.61.145:7078 with 1 cores, 1024.0 MB RAM 42 | 15/12/03 14:05:22 INFO client.AppClient$ClientEndpoint: Executor updated: app-20151203140010-0003/4 is now RUNNING 43 | 15/12/03 14:05:22 INFO client.AppClient$ClientEndpoint: Executor updated: app-20151203140010-0003/4 is now LOADING 44 | 15/12/03 14:05:22 INFO 
scheduler.TaskSetManager: Starting task 125.0 in stage 1.0 (TID 128, 172.31.60.214, partition 125,PROCESS_LOCAL, 2170 bytes) 45 | 15/12/03 14:05:22 INFO scheduler.TaskSetManager: Finished task 122.0 in stage 1.0 (TID 125) in 6466 ms on 172.31.60.214 (123/400) 46 | 15/12/03 14:05:22 INFO client.AppClient$ClientEndpoint: Executor added: app-20151203140010-0003/5 on worker-20151203140521-172.31.61.146-7078 (172.31.61.146:7078) with 1 cores 47 | 15/12/03 14:05:22 INFO cluster.SparkDeploySchedulerBackend: Granted executor ID app-20151203140010-0003/5 on hostPort 172.31.61.146:7078 with 1 cores, 1024.0 MB RAM 48 | 15/12/03 14:05:22 INFO client.AppClient$ClientEndpoint: Executor updated: app-20151203140010-0003/5 is now RUNNING 49 | 15/12/03 14:05:23 INFO client.AppClient$ClientEndpoint: Executor updated: app-20151203140010-0003/5 is now LOADING 50 | ``` 51 | 52 | Then you should start to see more distinct IP addresses in the log entries, and the log entries should be printed at a noticeably faster rate (every second). Your job is now using more nodes! 53 | 54 | ``` 55 | 15/12/03 14:07:36 INFO scheduler.TaskSetManager: Starting task 302.0 in stage 1.0 (TID 305, 172.31.60.214, partition 302,PROCESS_LOCAL, 2170 bytes) 56 | 15/12/03 14:07:36 INFO scheduler.TaskSetManager: Finished task 294.0 in stage 1.0 (TID 297) in 5641 ms on 172.31.60.214 (295/400) 57 | 15/12/03 14:07:37 INFO scheduler.TaskSetManager: Starting task 303.0 in stage 1.0 (TID 306, 172.31.61.142, partition 303,PROCESS_LOCAL, 2170 bytes) 58 | 15/12/03 14:07:37 INFO scheduler.TaskSetManager: Finished task 295.0 in stage 1.0 (TID 298) in 5561 ms on 172.31.61.142 (296/400) 59 | 15/12/03 14:07:38 INFO scheduler.TaskSetManager: Starting task 304.0 in stage 1.0 (TID 307, 172.31.51.194, partition 304,PROCESS_LOCAL, 2170 bytes) 60 | 15/12/03 14:07:38 INFO scheduler.TaskSetManager: Finished task 296.0 in stage 1.0 (TID 299) in 5744 ms on 172.31.51.194 (297/400) 61 | 15/12/03 14:07:38 INFO scheduler.TaskSetManager: Starting task 305.0 in stage 1.0 (TID 308, 172.31.54.237, partition 305,PROCESS_LOCAL, 2170 bytes) 62 | 15/12/03 14:07:38 INFO scheduler.TaskSetManager: Finished task 297.0 in stage 1.0 (TID 300) in 5611 ms on 172.31.54.237 (298/400) 63 | 15/12/03 14:07:38 INFO scheduler.TaskSetManager: Starting task 306.0 in stage 1.0 (TID 309, 172.31.61.144, partition 306,PROCESS_LOCAL, 2170 bytes) 64 | 15/12/03 14:07:38 INFO scheduler.TaskSetManager: Finished task 298.0 in stage 1.0 (TID 301) in 5790 ms on 172.31.61.144 (299/400) 65 | 15/12/03 14:07:40 INFO scheduler.TaskSetManager: Starting task 307.0 in stage 1.0 (TID 310, 172.31.61.143, partition 307,PROCESS_LOCAL, 2170 bytes) 66 | 15/12/03 14:07:40 INFO scheduler.TaskSetManager: Finished task 299.0 in stage 1.0 (TID 302) in 5749 ms on 172.31.61.143 (300/400) 67 | 15/12/03 14:07:41 INFO scheduler.TaskSetManager: Starting task 308.0 in stage 1.0 (TID 311, 172.31.61.145, partition 308,PROCESS_LOCAL, 2170 bytes) 68 | 15/12/03 14:07:41 INFO scheduler.TaskSetManager: Finished task 301.0 in stage 1.0 (TID 304) in 5402 ms on 172.31.61.145 (301/400) 69 | 15/12/03 14:07:41 INFO scheduler.TaskSetManager: Starting task 309.0 in stage 1.0 (TID 312, 172.31.61.146, partition 309,PROCESS_LOCAL, 2170 bytes) 70 | 15/12/03 14:07:41 INFO scheduler.TaskSetManager: Finished task 300.0 in stage 1.0 (TID 303) in 5634 ms on 172.31.61.146 (302/400) 71 | 15/12/03 14:07:42 INFO scheduler.TaskSetManager: Starting task 310.0 in stage 1.0 (TID 313, 172.31.60.214, partition 310,PROCESS_LOCAL, 2170 bytes) 72 | 15/12/03
14:07:42 INFO scheduler.TaskSetManager: Finished task 302.0 in stage 1.0 (TID 305) in 5676 ms on 172.31.60.214 (303/400) 73 | 15/12/03 14:07:42 INFO scheduler.TaskSetManager: Starting task 311.0 in stage 1.0 (TID 314, 172.31.61.142, partition 311,PROCESS_LOCAL, 2170 bytes) 74 | ``` 75 | 76 | A minute or two later it finishes: 77 | 78 | ``` 79 | 15/12/03 14:08:49 INFO scheduler.DAGScheduler: Job 0 finished: reduce at <console>:16, took 452.183523 s 80 | res0: Int = -1300313216 81 | ``` 82 | 83 | Now try running the exact same command again (promptly after the first one); you will see the job finish faster, since the whole job, rather than just part of it, runs with the extra resources: 84 | 85 | ``` 86 | 15/12/03 14:16:57 INFO scheduler.DAGScheduler: Job 1 finished: reduce at <console>:16, took 282.684993 s 87 | res1: Int = -1300313216 88 | ``` 89 | 90 | Finally, go make a cup of tea or something; when you return you will see that some of the instances have been terminated, and after a few more minutes the group will be back down to just 2. 91 | 92 | http://i.imgur.com/fBLF2EN.gif 93 | 94 | -------------------------------------------------------------------------------- /ami-packer/ansible/roles/cdh5/templates/hadoop/log4j.properties.j2: -------------------------------------------------------------------------------- 1 | # Copyright 2011 The Apache Software Foundation 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | # Define some default values that can be overridden by system properties 20 | hadoop.root.logger=INFO,console 21 | hadoop.log.dir=. 22 | hadoop.log.file=hadoop.log 23 | 24 | # Define the root logger to the system property "hadoop.root.logger". 25 | log4j.rootLogger=${hadoop.root.logger}, EventCounter 26 | 27 | # Logging Threshold 28 | log4j.threshold=ALL 29 | 30 | # Null Appender 31 | log4j.appender.NullAppender=org.apache.log4j.varia.NullAppender 32 | 33 | # 34 | # Rolling File Appender - cap space usage at 5gb.
35 | # 36 | hadoop.log.maxfilesize=256MB 37 | hadoop.log.maxbackupindex=20 38 | log4j.appender.RFA=org.apache.log4j.RollingFileAppender 39 | log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file} 40 | 41 | log4j.appender.RFA.MaxFileSize=${hadoop.log.maxfilesize} 42 | log4j.appender.RFA.MaxBackupIndex=${hadoop.log.maxbackupindex} 43 | 44 | log4j.appender.RFA.layout=org.apache.log4j.PatternLayout 45 | 46 | # Pattern format: Date LogLevel LoggerName LogMessage 47 | log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n 48 | # Debugging Pattern format 49 | #log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n 50 | 51 | 52 | # 53 | # Daily Rolling File Appender 54 | # 55 | 56 | log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender 57 | log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file} 58 | 59 | # Rollver at midnight 60 | log4j.appender.DRFA.DatePattern=.yyyy-MM-dd 61 | 62 | # 30-day backup 63 | #log4j.appender.DRFA.MaxBackupIndex=30 64 | log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout 65 | 66 | # Pattern format: Date LogLevel LoggerName LogMessage 67 | log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n 68 | # Debugging Pattern format 69 | #log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n 70 | 71 | 72 | # 73 | # console 74 | # Add "console" to rootlogger above if you want to use this 75 | # 76 | 77 | log4j.appender.console=org.apache.log4j.ConsoleAppender 78 | log4j.appender.console.target=System.err 79 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 80 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 81 | 82 | # 83 | # TaskLog Appender 84 | # 85 | 86 | #Default values 87 | hadoop.tasklog.taskid=null 88 | hadoop.tasklog.iscleanup=false 89 | hadoop.tasklog.noKeepSplits=4 90 | hadoop.tasklog.totalLogFileSize=100 91 | hadoop.tasklog.purgeLogSplits=true 92 | hadoop.tasklog.logsRetainHours=12 93 | 94 | log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender 95 | log4j.appender.TLA.taskId=${hadoop.tasklog.taskid} 96 | log4j.appender.TLA.isCleanup=${hadoop.tasklog.iscleanup} 97 | log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize} 98 | 99 | log4j.appender.TLA.layout=org.apache.log4j.PatternLayout 100 | log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n 101 | 102 | # 103 | # HDFS block state change log from block manager 104 | # 105 | # Uncomment the following to suppress normal block state change 106 | # messages from BlockManager in NameNode. 
107 | #log4j.logger.BlockStateChange=WARN 108 | 109 | # 110 | #Security appender 111 | # 112 | hadoop.security.logger=INFO,NullAppender 113 | hadoop.security.log.maxfilesize=256MB 114 | hadoop.security.log.maxbackupindex=20 115 | log4j.category.SecurityLogger=${hadoop.security.logger} 116 | hadoop.security.log.file=SecurityAuth-${user.name}.audit 117 | log4j.appender.RFAS=org.apache.log4j.RollingFileAppender 118 | log4j.appender.RFAS.File=${hadoop.log.dir}/${hadoop.security.log.file} 119 | log4j.appender.RFAS.layout=org.apache.log4j.PatternLayout 120 | log4j.appender.RFAS.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n 121 | log4j.appender.RFAS.MaxFileSize=${hadoop.security.log.maxfilesize} 122 | log4j.appender.RFAS.MaxBackupIndex=${hadoop.security.log.maxbackupindex} 123 | 124 | # 125 | # Daily Rolling Security appender 126 | # 127 | log4j.appender.DRFAS=org.apache.log4j.DailyRollingFileAppender 128 | log4j.appender.DRFAS.File=${hadoop.log.dir}/${hadoop.security.log.file} 129 | log4j.appender.DRFAS.layout=org.apache.log4j.PatternLayout 130 | log4j.appender.DRFAS.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n 131 | log4j.appender.DRFAS.DatePattern=.yyyy-MM-dd 132 | 133 | # 134 | # hadoop configuration logging 135 | # 136 | 137 | # Uncomment the following line to turn off configuration deprecation warnings. 138 | # log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=WARN 139 | 140 | # 141 | # hdfs audit logging 142 | # 143 | hdfs.audit.logger=INFO,NullAppender 144 | hdfs.audit.log.maxfilesize=256MB 145 | hdfs.audit.log.maxbackupindex=20 146 | log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=${hdfs.audit.logger} 147 | log4j.additivity.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=false 148 | log4j.appender.RFAAUDIT=org.apache.log4j.RollingFileAppender 149 | log4j.appender.RFAAUDIT.File=${hadoop.log.dir}/hdfs-audit.log 150 | log4j.appender.RFAAUDIT.layout=org.apache.log4j.PatternLayout 151 | log4j.appender.RFAAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n 152 | log4j.appender.RFAAUDIT.MaxFileSize=${hdfs.audit.log.maxfilesize} 153 | log4j.appender.RFAAUDIT.MaxBackupIndex=${hdfs.audit.log.maxbackupindex} 154 | 155 | # 156 | # mapred audit logging 157 | # 158 | mapred.audit.logger=INFO,NullAppender 159 | mapred.audit.log.maxfilesize=256MB 160 | mapred.audit.log.maxbackupindex=20 161 | log4j.logger.org.apache.hadoop.mapred.AuditLogger=${mapred.audit.logger} 162 | log4j.additivity.org.apache.hadoop.mapred.AuditLogger=false 163 | log4j.appender.MRAUDIT=org.apache.log4j.RollingFileAppender 164 | log4j.appender.MRAUDIT.File=${hadoop.log.dir}/mapred-audit.log 165 | log4j.appender.MRAUDIT.layout=org.apache.log4j.PatternLayout 166 | log4j.appender.MRAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n 167 | log4j.appender.MRAUDIT.MaxFileSize=${mapred.audit.log.maxfilesize} 168 | log4j.appender.MRAUDIT.MaxBackupIndex=${mapred.audit.log.maxbackupindex} 169 | 170 | # Custom Logging levels 171 | 172 | #log4j.logger.org.apache.hadoop.mapred.JobTracker=DEBUG 173 | #log4j.logger.org.apache.hadoop.mapred.TaskTracker=DEBUG 174 | #log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=DEBUG 175 | 176 | # Jets3t library 177 | log4j.logger.org.jets3t.service.impl.rest.httpclient.RestS3Service=ERROR 178 | 179 | # 180 | # Event Counter Appender 181 | # Sends counts of logging messages at different severity levels to Hadoop Metrics. 
182 | # 183 | log4j.appender.EventCounter=org.apache.hadoop.log.metrics.EventCounter 184 | 185 | # 186 | # Job Summary Appender 187 | # 188 | # Use following logger to send summary to separate file defined by 189 | # hadoop.mapreduce.jobsummary.log.file : 190 | # hadoop.mapreduce.jobsummary.logger=INFO,JSA 191 | # 192 | hadoop.mapreduce.jobsummary.logger=${hadoop.root.logger} 193 | hadoop.mapreduce.jobsummary.log.file=hadoop-mapreduce.jobsummary.log 194 | hadoop.mapreduce.jobsummary.log.maxfilesize=256MB 195 | hadoop.mapreduce.jobsummary.log.maxbackupindex=20 196 | log4j.appender.JSA=org.apache.log4j.RollingFileAppender 197 | log4j.appender.JSA.File=${hadoop.log.dir}/${hadoop.mapreduce.jobsummary.log.file} 198 | log4j.appender.JSA.MaxFileSize=${hadoop.mapreduce.jobsummary.log.maxfilesize} 199 | log4j.appender.JSA.MaxBackupIndex=${hadoop.mapreduce.jobsummary.log.maxbackupindex} 200 | log4j.appender.JSA.layout=org.apache.log4j.PatternLayout 201 | log4j.appender.JSA.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 202 | log4j.logger.org.apache.hadoop.mapred.JobInProgress$JobSummary=${hadoop.mapreduce.jobsummary.logger} 203 | log4j.additivity.org.apache.hadoop.mapred.JobInProgress$JobSummary=false 204 | 205 | # 206 | # Yarn ResourceManager Application Summary Log 207 | # 208 | # Set the ResourceManager summary log filename 209 | yarn.server.resourcemanager.appsummary.log.file=rm-appsummary.log 210 | # Set the ResourceManager summary log level and appender 211 | yarn.server.resourcemanager.appsummary.logger=${hadoop.root.logger} 212 | #yarn.server.resourcemanager.appsummary.logger=INFO,RMSUMMARY 213 | 214 | # To enable AppSummaryLogging for the RM, 215 | # set yarn.server.resourcemanager.appsummary.logger to 216 | # ,RMSUMMARY in hadoop-env.sh 217 | 218 | # Appender for ResourceManager Application Summary Log 219 | # Requires the following properties to be set 220 | # - hadoop.log.dir (Hadoop Log directory) 221 | # - yarn.server.resourcemanager.appsummary.log.file (resource manager app summary log filename) 222 | # - yarn.server.resourcemanager.appsummary.logger (resource manager app summary log level and appender) 223 | 224 | log4j.logger.org.apache.hadoop.yarn.server.resourcemanager.RMAppManager$ApplicationSummary=${yarn.server.resourcemanager.appsummary.logger} 225 | log4j.additivity.org.apache.hadoop.yarn.server.resourcemanager.RMAppManager$ApplicationSummary=false 226 | log4j.appender.RMSUMMARY=org.apache.log4j.RollingFileAppender 227 | log4j.appender.RMSUMMARY.File=${hadoop.log.dir}/${yarn.server.resourcemanager.appsummary.log.file} 228 | log4j.appender.RMSUMMARY.MaxFileSize=256MB 229 | log4j.appender.RMSUMMARY.MaxBackupIndex=20 230 | log4j.appender.RMSUMMARY.layout=org.apache.log4j.PatternLayout 231 | log4j.appender.RMSUMMARY.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n 232 | 233 | # HS audit log configs 234 | #mapreduce.hs.audit.logger=INFO,HSAUDIT 235 | #log4j.logger.org.apache.hadoop.mapreduce.v2.hs.HSAuditLogger=${mapreduce.hs.audit.logger} 236 | #log4j.additivity.org.apache.hadoop.mapreduce.v2.hs.HSAuditLogger=false 237 | #log4j.appender.HSAUDIT=org.apache.log4j.DailyRollingFileAppender 238 | #log4j.appender.HSAUDIT.File=${hadoop.log.dir}/hs-audit.log 239 | #log4j.appender.HSAUDIT.layout=org.apache.log4j.PatternLayout 240 | #log4j.appender.HSAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n 241 | #log4j.appender.HSAUDIT.DatePattern=.yyyy-MM-dd 242 | 243 | # Http Server Request Logs 244 | #log4j.logger.http.requests.namenode=INFO,namenoderequestlog 245 | 
#log4j.appender.namenoderequestlog=org.apache.hadoop.http.HttpRequestLogAppender 246 | #log4j.appender.namenoderequestlog.Filename=${hadoop.log.dir}/jetty-namenode-yyyy_mm_dd.log 247 | #log4j.appender.namenoderequestlog.RetainDays=3 248 | 249 | #log4j.logger.http.requests.datanode=INFO,datanoderequestlog 250 | #log4j.appender.datanoderequestlog=org.apache.hadoop.http.HttpRequestLogAppender 251 | #log4j.appender.datanoderequestlog.Filename=${hadoop.log.dir}/jetty-datanode-yyyy_mm_dd.log 252 | #log4j.appender.datanoderequestlog.RetainDays=3 253 | 254 | #log4j.logger.http.requests.resourcemanager=INFO,resourcemanagerrequestlog 255 | #log4j.appender.resourcemanagerrequestlog=org.apache.hadoop.http.HttpRequestLogAppender 256 | #log4j.appender.resourcemanagerrequestlog.Filename=${hadoop.log.dir}/jetty-resourcemanager-yyyy_mm_dd.log 257 | #log4j.appender.resourcemanagerrequestlog.RetainDays=3 258 | 259 | #log4j.logger.http.requests.jobhistory=INFO,jobhistoryrequestlog 260 | #log4j.appender.jobhistoryrequestlog=org.apache.hadoop.http.HttpRequestLogAppender 261 | #log4j.appender.jobhistoryrequestlog.Filename=${hadoop.log.dir}/jetty-jobhistory-yyyy_mm_dd.log 262 | #log4j.appender.jobhistoryrequestlog.RetainDays=3 263 | 264 | #log4j.logger.http.requests.nodemanager=INFO,nodemanagerrequestlog 265 | #log4j.appender.nodemanagerrequestlog=org.apache.hadoop.http.HttpRequestLogAppender 266 | #log4j.appender.nodemanagerrequestlog.Filename=${hadoop.log.dir}/jetty-nodemanager-yyyy_mm_dd.log 267 | #log4j.appender.nodemanagerrequestlog.RetainDays=3 268 | -------------------------------------------------------------------------------- /spark-cloud.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # pylint: disable=line-too-long 3 | # idea based on https://github.com/apache/spark/blob/master/ec2/spark_ec2.py 4 | from __future__ import absolute_import, division, print_function, unicode_literals 5 | import random 6 | 7 | import sys 8 | import time 9 | import socket 10 | from optparse import OptionParser 11 | from datetime import datetime 12 | 13 | import boto 14 | import boto.ec2.autoscale 15 | import boto.ec2.cloudwatch 16 | from boto.ec2.autoscale import LaunchConfiguration, AutoScalingGroup, ScalingPolicy 17 | from boto.ec2.cloudwatch import MetricAlarm 18 | from boto.ec2.networkinterface import NetworkInterfaceSpecification, NetworkInterfaceCollection 19 | 20 | 21 | def get_group(conn, name): 22 | """ 23 | Get the EC2 security group of the given name 24 | """ 25 | groups = conn.get_all_security_groups() 26 | return [g for g in groups if g.name == name] 27 | 28 | def get_or_make_group(conn, name, vpc_id): 29 | """ 30 | Get the EC2 security group of the given name, creating it if it doesn't exist 31 | """ 32 | groups = conn.get_all_security_groups() 33 | group = [g for g in groups if g.name == name] 34 | if len(group) > 0: 35 | return group[0] 36 | else: 37 | print("Creating security group " + name) 38 | return conn.create_security_group(name, "Spark Cloud group", vpc_id) 39 | 40 | 41 | def wait_for_tcp_port(host, port=22): 42 | sys.stdout.write( 43 | "Waiting for port {port} to be available".format(port=port) 44 | ) 45 | sys.stdout.flush() 46 | start_time = datetime.now() 47 | while True: 48 | sys.stdout.write(".") 49 | sys.stdout.flush() 50 | s = socket.socket() 51 | result = s.connect_ex((host, port)) 52 | if result == 0: 53 | s.close() 54 | break 55 | time.sleep(5) 56 | end_time = datetime.now() 57 | print("Port {port} is now available. 
Waited {t} seconds.".format( 58 | port=port, 59 | t=(end_time - start_time).seconds 60 | )) 61 | 62 | 63 | def wait_for_cluster_state(conn, cluster_instances, cluster_state="running", name="master"): 64 | """ 65 | Wait for all the instances in the cluster to reach a designated state. 66 | cluster_instances: a list of boto.ec2.instance.Instance 67 | cluster_state: a string representing the desired state of all the instances in the cluster 68 | value can be valid value from boto.ec2.instance.InstanceState such as 69 | 'running', 'terminated', etc. 70 | """ 71 | sys.stdout.write( 72 | "Waiting for {n} to enter '{s}' state.".format(n=name, s=cluster_state) 73 | ) 74 | sys.stdout.flush() 75 | start_time = datetime.now() 76 | while True: 77 | for i in cluster_instances: 78 | i.update() 79 | max_batch = 100 80 | statuses = [] 81 | for j in xrange(0, len(cluster_instances), max_batch): 82 | batch = [i.id for i in cluster_instances[j:j + max_batch]] 83 | statuses.extend(conn.get_all_instance_status(instance_ids=batch)) 84 | if all(i.state == cluster_state for i in cluster_instances): 85 | break 86 | sys.stdout.write(".") 87 | sys.stdout.flush() 88 | time.sleep(5) 89 | sys.stdout.write("\n") 90 | end_time = datetime.now() 91 | print("Cluster is now in '{s}' state. Waited {t} seconds.".format( 92 | s=cluster_state, 93 | t=(end_time - start_time).seconds 94 | )) 95 | 96 | 97 | def setup_security_groups(conn, cluster_name, opts): 98 | print("Setting up security groups...") 99 | master_group = get_or_make_group( 100 | conn, cluster_name + "-master", opts.vpc_id) 101 | worker_group = get_or_make_group( 102 | conn, cluster_name + "-workers", opts.vpc_id) 103 | authorized_address = opts.authorized_address 104 | if master_group.rules == []: # Group was just now created 105 | if opts.vpc_id is None: 106 | master_group.authorize(src_group=master_group) 107 | master_group.authorize(src_group=worker_group) 108 | else: 109 | master_group.authorize(ip_protocol='-1', from_port=None, to_port=None, 110 | src_group=master_group) 111 | master_group.authorize(ip_protocol='-1', from_port=None, to_port=None, 112 | src_group=worker_group) 113 | master_group.authorize('tcp', 0, 65535, authorized_address) 114 | if worker_group.rules == []: # Group was just now created 115 | if opts.vpc_id is None: 116 | worker_group.authorize(src_group=master_group) 117 | worker_group.authorize(src_group=worker_group) 118 | else: 119 | worker_group.authorize(ip_protocol='-1', from_port=None, to_port=None, 120 | src_group=master_group) 121 | worker_group.authorize(ip_protocol='-1', from_port=None, to_port=None, 122 | src_group=worker_group) 123 | worker_group.authorize('tcp', 0, 65535, authorized_address) 124 | return (master_group, worker_group) 125 | 126 | 127 | def delete_security_groups(conn, cluster_name): 128 | print("Deleting security groups...") 129 | master_group = get_group(conn, cluster_name + "-master") 130 | # TODO: deprecate this in Jan 2016 131 | slave_group = get_group(conn, cluster_name + "-slaves") 132 | worker_group = get_group(conn, cluster_name + "-workers") 133 | groups = master_group + worker_group + slave_group 134 | success = True 135 | for group in groups: 136 | print("Deleting rules in security group " + group.name) 137 | for rule in group.rules: 138 | for grant in rule.grants: 139 | success &= conn.revoke_security_group(group_id=group.id, ip_protocol=rule.ip_protocol, 140 | from_port=rule.from_port, to_port=rule.to_port, 141 | src_security_group_group_id=grant.group_id, cidr_ip=grant.cidr_ip) 142 | time.sleep(2) 
143 | for group in groups: 144 | try: 145 | conn.delete_security_group(group_id=group.id) 146 | print("Deleted security group %s" % group.name) 147 | except boto.exception.EC2ResponseError as e: 148 | success = False 149 | print("Failed to delete security group %s" % group.name) 150 | print(e) 151 | if not success: 152 | print("Failed to delete all security groups, try again later") 153 | 154 | 155 | def parse_options(): 156 | parser = OptionParser( 157 | usage="%prog [options] <action> <cluster_name>\n\n" 158 | + "<action> can be: launch, destroy") 159 | parser.add_option( 160 | "-k", "--key-pair", default=None, 161 | help="Key pair to use on instances") 162 | parser.add_option( 163 | "-i", "--identity-file", 164 | help="SSH private key file to use for logging into instances") 165 | parser.add_option( 166 | "-r", "--region", default="us-east-1", 167 | help="EC2 region used to launch instances in, or to find them in (default: %default)") 168 | parser.add_option( 169 | "-a", "--ami", default="ami-cdf3bea7", 170 | help="Amazon Machine Image ID to use") 171 | parser.add_option( 172 | "--authorized-address", type="string", default="0.0.0.0/0", 173 | help="Address to authorize on created security groups (default: %default)") 174 | parser.add_option( 175 | "-t", "--instance-type", default="m3.medium", 176 | help="Type of instance to launch (default: %default)") 177 | parser.add_option( 178 | "-m", "--master-instance-type", default="m3.medium", 179 | help="Master instance type (default: %default)") 180 | 181 | parser.add_option( 182 | "-u", "--scale-up-nodes-amount", type="int", default=5, 183 | help="Number of nodes to scale up by when the scale-up alarm is triggered (default %default)") 184 | parser.add_option( 185 | "-d", "--scale-down-nodes-amount", type="int", default=1, 186 | help="Number of nodes to scale down by when the scale-down alarm is triggered (default %default)") 187 | 188 | parser.add_option( 189 | "-U", "--scale-up-cooldown", type="int", default=60, 190 | help="The amount of time, in seconds, after a scale-up activity completes before any further " + 191 | "scale-up actions can occur. See the following link for more information " + 192 | "http://docs.aws.amazon.com/AutoScaling/latest/DeveloperGuide/Cooldown.html#cooldowns-scaling-specific (default %default)") 193 | parser.add_option( 194 | "-D", "--scale-down-cooldown", type="int", default=60, 195 | help="The amount of time, in seconds, after a scale-down activity completes before any further " + 196 | "scale-down actions can occur. See the following link for more information " + 197 | "http://docs.aws.amazon.com/AutoScaling/latest/DeveloperGuide/Cooldown.html#cooldowns-scaling-specific (default %default)") 198 | 199 | parser.add_option( 200 | "-n", "--min-instances", type="int", default=2, 201 | help="Minimum number of instances in the auto-scaling group (default %default)") 202 | parser.add_option( 203 | "-x", "--max-instances", type="int", default=8, 204 | help="Maximum number of instances in the auto-scaling group (default %default)") 205 | 206 | parser.add_option( 207 | "--max-spot-price", metavar="PRICE", type="float", 208 | help="If specified, launch workers as spot instances with the given " + 209 | "maximum price (in dollars). The actual price paid will be the market price " + 210 | "so potentially less than this value. 
If the market price exceeds this value " + 211 | "your nodes will shut down and no more will spin up") 212 | parser.add_option( 213 | "--subnet-id", default=None, 214 | help="VPC subnet to launch instances in") 215 | parser.add_option( 216 | "--vpc-id", default=None, 217 | help="VPC to launch instances in") 218 | parser.add_option( 219 | "-z", "--zone", default=None, 220 | help="Availability zone to launch instances in") 221 | 222 | (opts, args) = parser.parse_args() 223 | if len(args) != 2: 224 | parser.print_help() 225 | sys.exit(1) 226 | 227 | (action, cluster_name) = args 228 | return (opts, action, cluster_name) 229 | 230 | 231 | def find_instance_by_name(conn, name): 232 | reservations = conn.get_all_instances() 233 | instances = [i for r in reservations for i in r.instances] 234 | for i in instances: 235 | if "Name" in i.tags and name == i.tags['Name']: 236 | return i 237 | return None 238 | 239 | 240 | def start_master(conn, opts, cluster_name, master_group): 241 | try: 242 | conn.get_all_images(image_ids=[opts.ami])[0] 243 | except boto.exception.EC2ResponseError: 244 | print("Could not find AMI " + opts.ami) 245 | sys.exit(1) 246 | if opts.vpc_id: 247 | interface = NetworkInterfaceSpecification(subnet_id=opts.subnet_id, 248 | groups=[master_group.id], 249 | associate_public_ip_address=True) 250 | interfaces = NetworkInterfaceCollection(interface) 251 | security_group_ids = None 252 | else: 253 | interfaces = None 254 | security_group_ids = [master_group.id] 255 | master_res = conn.run_instances( 256 | image_id=opts.ami, 257 | key_name=opts.key_pair, 258 | instance_type=opts.master_instance_type, 259 | placement=opts.zone, 260 | min_count=1, 261 | max_count=1, 262 | network_interfaces=interfaces, 263 | security_group_ids=security_group_ids) 264 | instance = master_res.instances[0] 265 | time.sleep(1) 266 | conn.create_tags( 267 | [instance.id], {"Name": "{c}-master".format(c=cluster_name)}) 268 | return instance 269 | 270 | 271 | def validate_opts(conn, opts, action): 272 | if opts.zone is None and opts.vpc_id is None: 273 | opts.zone = random.choice(conn.get_all_zones()).name 274 | if opts.vpc_id is not None and opts.zone is None: 275 | print("please specify zone with vpc_id") 276 | sys.exit(1) 277 | if opts.zone is None: 278 | print("please specify zone (--zone)") 279 | sys.exit(1) 280 | if action == "launch": 281 | if opts.key_pair is None: 282 | print("please specify keypair (-k)") 283 | sys.exit(1) 284 | return opts 285 | 286 | 287 | def create_autoscaling_group(autoscale, cluster_name, master_node, opts, slave_group): 288 | lclist = autoscale.get_all_launch_configurations( 289 | names=[cluster_name + "-lc"]) 290 | if lclist: 291 | lc = lclist[0] 292 | else: 293 | lc = LaunchConfiguration( 294 | name=cluster_name + "-lc", 295 | image_id=opts.ami, 296 | key_name=opts.key_pair, 297 | security_groups=[slave_group.id], 298 | instance_type=opts.instance_type, 299 | user_data="SPARK_MASTER=" + master_node.private_dns_name + "\n", 300 | instance_monitoring=True, 301 | spot_price=opts.max_spot_price) 302 | autoscale.create_launch_configuration(lc) 303 | aglist = autoscale.get_all_groups(names=[cluster_name + "-ag"]) 304 | if aglist: 305 | ag = aglist[0] 306 | else: 307 | ag = AutoScalingGroup(group_name=cluster_name + "-ag", 308 | launch_config=lc, 309 | min_size=opts.min_instances, 310 | max_size=opts.max_instances, 311 | connection=autoscale, 312 | vpc_zone_identifier=opts.subnet_id, 313 | availability_zones=[opts.zone]) 314 | autoscale.create_auto_scaling_group(ag) 315 | as_tag = 
boto.ec2.autoscale.Tag(key='Name', 316 | value=cluster_name + '-worker', 317 | propagate_at_launch=True, 318 | resource_id=cluster_name + "-ag") 319 | autoscale.create_or_update_tags([as_tag]) 320 | 321 | 322 | def create_autoscaling_policy(autoscale, cluster_name, opts): 323 | scale_up_policy = ScalingPolicy( 324 | name='scale_up', adjustment_type='ChangeInCapacity', 325 | as_name=cluster_name + "-ag", scaling_adjustment=opts.scale_up_nodes_amount, cooldown=opts.scale_up_cooldown) 326 | scale_down_policy = ScalingPolicy( 327 | name='scale_down', adjustment_type='ChangeInCapacity', 328 | as_name=cluster_name + "-ag", scaling_adjustment=-opts.scale_down_nodes_amount, cooldown=opts.scale_down_cooldown) 329 | autoscale.create_scaling_policy(scale_up_policy) 330 | autoscale.create_scaling_policy(scale_down_policy) 331 | scale_up_policy = autoscale.get_all_policies( 332 | as_group=cluster_name + "-ag", policy_names=['scale_up'])[0] 333 | scale_down_policy = autoscale.get_all_policies( 334 | as_group=cluster_name + "-ag", policy_names=['scale_down'])[0] 335 | alarm_dimensions = {"AutoScalingGroupName": cluster_name + "-ag"} 336 | cloudwatch = boto.ec2.cloudwatch.connect_to_region(opts.region) 337 | scale_up_alarm = MetricAlarm( 338 | name='scale_up_on_cpu', namespace='AWS/EC2', 339 | metric='CPUUtilization', statistic='Average', 340 | comparison='>', threshold='50', 341 | period='60', evaluation_periods=1, 342 | alarm_actions=[scale_up_policy.policy_arn], 343 | dimensions=alarm_dimensions) 344 | cloudwatch.create_alarm(scale_up_alarm) 345 | scale_down_alarm = MetricAlarm( 346 | name='scale_down_on_cpu', namespace='AWS/EC2', 347 | metric='CPUUtilization', statistic='Average', 348 | comparison='<', threshold='40', 349 | period='60', evaluation_periods=1, 350 | alarm_actions=[scale_down_policy.policy_arn], 351 | dimensions=alarm_dimensions) 352 | cloudwatch.create_alarm(scale_down_alarm) 353 | 354 | 355 | def main(): 356 | (opts, action, cluster_name) = parse_options() 357 | conn = boto.ec2.connect_to_region(opts.region) 358 | opts = validate_opts(conn, opts, action) 359 | 360 | if action == "launch": 361 | (master_group, slave_group) = setup_security_groups(conn, cluster_name, opts) 362 | master_node = find_instance_by_name(conn, cluster_name + '-master') 363 | if not master_node: 364 | master_node = start_master(conn, opts, cluster_name, master_group) 365 | print("Master node: {m}".format(m=master_node)) 366 | wait_for_cluster_state( 367 | conn=conn, 368 | cluster_instances=([master_node]), 369 | ) 370 | autoscale = boto.ec2.autoscale.connect_to_region(opts.region) 371 | create_autoscaling_group(autoscale, cluster_name, master_node, opts, slave_group) 372 | create_autoscaling_policy(autoscale, cluster_name, opts) 373 | 374 | wait_for_tcp_port(master_node.public_dns_name) 375 | print("SSH ready:") 376 | print("ssh ubuntu@{h}".format(h=master_node.public_dns_name)) 377 | wait_for_tcp_port(master_node.public_dns_name, port=18080) 378 | print("Spark master ready:") 379 | print( 380 | "Spark WebUI: http://{h}:18080".format(h=master_node.public_dns_name)) 381 | if action == "destroy": 382 | master_node = find_instance_by_name(conn, cluster_name + '-master') 383 | if master_node: 384 | print("Terminating master...") 385 | conn.create_tags([master_node.id], {"Name": "{c}-master-terminated".format(c=cluster_name)}) 386 | master_node.terminate() 387 | print("Shutting down autoscaling group...") 388 | autoscale = boto.ec2.autoscale.connect_to_region(opts.region) 389 | aglist = 
autoscale.get_all_groups(names=[cluster_name + "-ag"]) 390 | ag = None 391 | if aglist: 392 | ag = aglist[0] 393 | ag.shutdown_instances() 394 | instances_ids = [i.instance_id for i in ag.instances] 395 | instances = conn.get_only_instances(instances_ids) 396 | else: 397 | instances = [] 398 | lclist = autoscale.get_all_launch_configurations(names=[cluster_name + "-lc"]) 399 | lc = None 400 | if lclist: 401 | lc = lclist[0] 402 | wait_for_cluster_state( 403 | conn, instances, cluster_state="terminated", name="instances") 404 | time.sleep(10) 405 | if ag: 406 | try: 407 | ag.delete() 408 | except Exception as e: 409 | print("Couldn't delete autoscaling group: %s" % e) 410 | if lc: 411 | try: 412 | lc.delete() 413 | except Exception as e: 414 | print("Couldn't delete launch configuration: %s" % e) 415 | delete_security_groups(conn, cluster_name) 416 | print("All done.") 417 | 418 | 419 | if __name__ == "__main__": 420 | main() 421 | --------------------------------------------------------------------------------
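
For quick reference, a minimal way to exercise spark-cloud.py from a shell, sketched under assumptions: the default us-east-1 region and AMI are used, an EC2 key pair named "my-key" already exists in that region, and the cluster name "demo" is arbitrary. The flags map directly onto the optparse definitions above; anything omitted falls back to the script's defaults.

    # launch a master instance plus an auto-scaling worker group for the cluster "demo"
    ./spark-cloud.py -k my-key -t m3.medium -n 2 -x 8 launch demo

    # tear it all down again: master, auto-scaling group, launch configuration, security groups
    ./spark-cloud.py destroy demo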