├── bin
│   ├── core
│   │   ├── __init__.py
│   │   ├── nginx.py
│   │   ├── common.py
│   │   ├── config_nginx.py
│   │   ├── ecs.py
│   │   └── utils.py
│   ├── service
│   │   ├── __init__.py
│   │   ├── spark_notebook.py
│   │   ├── hue.py
│   │   ├── spark.py
│   │   └── hdfs.py
│   ├── conf
│   │   └── nginx.conf.template
│   ├── sh
│   │   └── mount_disk.sh
│   └── spark_ecs.py
├── ecs-image-list
│   ├── cn-beijing
│   │   └── ecs-image-id
│   ├── cn-hangzhou
│   │   └── ecs-image-id
│   ├── cn-qingdao
│   │   └── ecs-image-id
│   └── cn-shenzhen
│       └── ecs-image-id
├── README.md
├── doc
│   ├── ssh_tunnel.md
│   └── manual.md
└── LICENSE

/bin/core/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/bin/service/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/ecs-image-list/cn-beijing/ecs-image-id:
--------------------------------------------------------------------------------
1 | m-25dt21m47
--------------------------------------------------------------------------------
/ecs-image-list/cn-hangzhou/ecs-image-id:
--------------------------------------------------------------------------------
1 | m-23xecoatf
--------------------------------------------------------------------------------
/ecs-image-list/cn-qingdao/ecs-image-id:
--------------------------------------------------------------------------------
1 | m-28w0wqwa6
--------------------------------------------------------------------------------
/ecs-image-list/cn-shenzhen/ecs-image-id:
--------------------------------------------------------------------------------
1 | m-94ksoicp4
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spark on ECS
2 | 
3 | ## Introduction
4 | 1. [Script user guide](doc/manual.md)
5 | 2. [SSH tunnel setup](doc/ssh_tunnel.md)
6 | 
7 | ## Features
8 | --------
9 | 1. Quickly build an ECS-based Spark cluster
10 | A single script sets up an efficient, stable Spark runtime environment, and the cluster lifecycle can be managed simply and quickly from the command line
11 | 2. Spark tooling support
12 | The runtime environment integrates Spark-notebook, Hue and more, and exposes the native Spark UI, so you can run and debug code and monitor jobs from a convenient web interface
13 | 3. Security
14 | All web pages are reached through SSH tunnels, so port forwarding encrypts web access even on untrusted networks and protects personal privacy and important business information
15 | 4. Openness
16 | All software is taken from community open-source releases; the code is open and transparent, and the documentation is rich
17 | 
18 | 
--------------------------------------------------------------------------------
/bin/conf/nginx.conf.template:
--------------------------------------------------------------------------------
1 | worker_processes 1;
2 | error_log logs/error.log;
3 | error_log logs/error.log notice;
4 | error_log logs/error.log info;
5 | pid logs/nginx.pid;
6 | 
7 | events {
8 |     worker_connections 1024;
9 | }
10 | 
11 | http {
12 |     include mime.types;
13 |     default_type application/octet-stream;
14 |     sendfile off;
15 |     keepalive_timeout 20;
16 |     gzip on;
17 |     proxy_intercept_errors off;
18 | 
19 |     ${upstream_place_holder}
20 | 
21 |     ${server_place_holder}
22 | }
--------------------------------------------------------------------------------
/bin/service/spark_notebook.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #coding=utf-8
3 | import os
4 | import sys
5 | from core import ecs, utils
6 | from core.common import GlobalVar
7 | 
8 | def start_spark_notebook(masters, opts):
9 |     print "==> Starting Spark Notebook service..."
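    # Look up the master's intranet IP through the ECS API, then launch spark-notebook
    # on port 9090 in the background over an sshpass/ssh session.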
10 | master = masters[0] 11 | ins = ecs.get_instance_info(master) 12 | ip = ins['InnerIpAddress']['IpAddress'][0] 13 | launch_notebook = ' \" cd %s; nohup ./bin/spark-notebook -Dhttp.port=9090 > /dev/null 2>&1 & \" ' \ 14 | % GlobalVar.SPARK_NOTEBOOK_INSTALL_DIR 15 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, launch_notebook)) 16 | print "==> Started Spark Notebook service successfully..." 17 | 18 | def stop_spark_notebook(masters, opts): 19 | print "==> Stopping Spark Notebook..." 20 | master = masters[0] 21 | ins = ecs.get_instance_info(master) 22 | ip = ins['InnerIpAddress']['IpAddress'][0] 23 | stop_notebook = ' \" cd %s; cat RUNNING_PID | xargs -r kill -9; rm -f RUNNING_PID \" ' \ 24 | % GlobalVar.SPARK_NOTEBOOK_INSTALL_DIR 25 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, stop_notebook)) 26 | print "==> Stopped Spark Notebook service successfully..." 27 | -------------------------------------------------------------------------------- /bin/sh/mount_disk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /amrdata 4 | 5 | if which mkfs.ext4 > /dev/null ;then 6 | if ls /dev/xvdb1 > /dev/null;then 7 | if cat /etc/fstab|grep /amrdata > /dev/null ;then 8 | if cat /etc/fstab|grep /amrdata|grep ext3 > /dev/null ;then 9 | sed -i "/\/amrdata/d" /etc/fstab 10 | echo '/dev/xvdb1 /amrdata ext4 defaults 0 0' >> /etc/fstab 11 | fi 12 | else 13 | echo '/dev/xvdb1 /amrdata ext4 defaults 0 0' >> /etc/fstab 14 | fi 15 | mount -a 16 | echo "" 17 | exit; 18 | else 19 | if ls /dev/xvdb ;then 20 | fdisk /dev/xvdb << EOF 21 | n 22 | p 23 | 1 24 | 25 | 26 | wq 27 | EOF 28 | mkfs.ext4 /dev/xvdb1 29 | echo '/dev/xvdb1 /amrdata ext4 defaults 0 0' >> /etc/fstab 30 | fi 31 | fi 32 | else 33 | if ls /dev/xvdb1 > /dev/null;then 34 | if cat /etc/fstab|grep /amrdata > /dev/null ;then 35 | echo "" 36 | else 37 | echo '/dev/xvdb1 /amrdata ext3 defaults 0 0' >> /etc/fstab 38 | fi 39 | mount -a 40 | echo "" 41 | exit; 42 | else 43 | if ls /dev/xvdb ;then 44 | fdisk /dev/xvdb << EOF 45 | n 46 | p 47 | 1 48 | 49 | 50 | wq 51 | EOF 52 | mkfs.ext3 /dev/xvdb1 53 | echo '/dev/xvdb1 /amrdata ext3 defaults 0 0' >> /etc/fstab 54 | fi 55 | fi 56 | fi 57 | 58 | mount -a -------------------------------------------------------------------------------- /bin/core/nginx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding=utf-8 3 | import sys 4 | import os 5 | import utils 6 | from core.common import GlobalVar 7 | from config_nginx import generate_config_file 8 | 9 | def copy_file(opts, src_file, ip, dst): 10 | try: 11 | os.system("sshpass -p %s scp -r %s %s %s@%s:%s" % (opts.pwd, " ".join(utils.ssh_args()), src_file, opts.user, ip, dst)) 12 | except Exception as e: 13 | print(e.message) 14 | raise e 15 | 16 | def execute_remote_command(opts, ip, command): 17 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, command)) 18 | 19 | def execute_local_command(command): 20 | os.system(command) 21 | 22 | def start_nginx(opts, host_info_file, ip): 23 | try: 24 | nginx_config_template_file = "%s/conf/nginx.conf.template" % GlobalVar.SPARK_ECS_DIR 25 | local_nginx_config = "%s/conf/nginx.conf" % GlobalVar.SPARK_ECS_DIR 26 | dst = "/opt/nginx-1.9.1/conf/nginx.conf" 27 | generate_config_file(host_info_file, nginx_config_template_file, local_nginx_config) 28 | 
copy_file(opts, local_nginx_config, ip, dst)
29 |         start_nginx_command = "/opt/nginx-1.9.1/sbin/nginx"
30 |         execute_remote_command(opts, ip, start_nginx_command)
31 |         return 1
32 |     except Exception as e:
33 |         print "start nginx failed %s" % str(e.message)
34 |         return -1
35 | 
36 | def do_stop_nginx(opts, ip):
37 |     try:
38 |         stopNginxCommand = "/opt/nginx-1.9.1/sbin/nginx -s stop"
39 |         execute_remote_command(opts, ip, stopNginxCommand)
40 |         return 1
41 |     except Exception as e:
42 |         print "stop nginx failed " + str(e.message)
43 |         return -1
44 | 
45 | 
46 | 
47 | 
48 | 
--------------------------------------------------------------------------------
/bin/core/common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #coding=utf-8
3 | import os
4 | import sys
5 | 
6 | class GlobalVar:
7 | 
8 |     DEFAULT_CONF_DIR = "/root/.config"
9 |     PROPERTY_FILE = "%s/packages.property" % DEFAULT_CONF_DIR
10 |     HADOOP_INSTALL_DIR = "/opt/hadoop"
11 |     HADOOP_CONF_DIR = "%s/etc/hadoop" % HADOOP_INSTALL_DIR
12 |     SPARK_INSTALL_DIR = "/opt/spark"
13 |     SPARK_CONF_DIR = "%s/conf" % SPARK_INSTALL_DIR
14 |     SPARK_NOTEBOOK_INSTALL_DIR = "/opt/spark-notebook"
15 |     HUE_INSTALL_DIR = "/opt/hue"
16 |     ALIYUN_SDK_URL = "http://docs-aliyun-com-cn-b.oss-cn-hangzhou.aliyuncs.com/ecs/assets/sdk/python_sdk.tgz"
17 |     SPARK_ECS_DIR = ""
18 |     CLUSTER_STATUS = "%s/status/cluster-" % DEFAULT_CONF_DIR
19 |     CLUSTER_INSTANCES = "%s/instances/" % DEFAULT_CONF_DIR
20 |     CLUSTER_HOSTS = ""
21 | 
22 |     ECS_API_PAGESIZE = 50
23 | 
24 |     ECS_INSTANCE_TYPE = {
25 |         "ecs.t1.small": (1, 1),
26 |         "ecs.s1.small": (1, 2),
27 |         "ecs.s1.medium": (1, 4),
28 |         "ecs.s2.small": (2, 2),
29 |         "ecs.s2.large": (2, 4),
30 |         "ecs.s2.xlarge": (2, 8),
31 |         "ecs.s3.medium": (4, 4),
32 |         "ecs.s3.large": (4, 8),
33 |         "ecs.m1.medium": (4, 16)
34 |     }
35 | 
36 |     ECS_REGION = {
37 |         "1": "cn-hangzhou",
38 |         "2": "cn-shenzhen",
39 |         "3": "cn-beijing",
40 |         "4": "cn-qingdao"
41 |     }
42 | 
43 |     SPARK_IMAGES = {
44 |         ("Spark-1.3.1", "cn-hangzhou"): "m-23xecoatf",
45 |         ("Spark-1.3.1", "cn-shenzhen"): "m-94ksoicp4",
46 |         ("Spark-1.3.1", "cn-beijing"): "m-25dt21m47",
47 |         ("Spark-1.3.1", "cn-qingdao"): "m-28w0wqwa6"
48 |     }
49 | 
50 |     AVAILABLE_SAPRK_VERSION = {
51 |         "1": "Spark-1.3.1"
52 |     }
53 | 
--------------------------------------------------------------------------------
/bin/service/hue.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #coding=utf-8
3 | import os
4 | import sys
5 | from core import ecs, utils
6 | from core.common import GlobalVar
7 | 
8 | def start_hue(masters, opts):
9 |     print "==> Starting HUE service..."
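    # Resolve the master's intranet IP, copy the default hue.ini into the Hue install
    # directory, then start the Livy server and the Hue supervisor in the background over ssh.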
10 | master = masters[0] 11 | ins = ecs.get_instance_info(master) 12 | ip = ins['InnerIpAddress']['IpAddress'][0] 13 | copy_command = ' \"/bin/cp -r %s/hue/desktop/conf/hue.ini %s/desktop/conf/ \"' \ 14 | % (GlobalVar.DEFAULT_CONF_DIR, GlobalVar.HUE_INSTALL_DIR) 15 | launch_hue_step1 = ' \"source /root/.bash_profile; cd %s/build/env/bin/; nohup ./hue livy_server > /dev/null 2>&1 & \" ' \ 16 | % GlobalVar.HUE_INSTALL_DIR 17 | launch_hue_step2 = ' \"source /root/.bash_profile; cd %s/build/env/bin/; nohup ./supervisor > /dev/null 2>&1 & \" ' \ 18 | % GlobalVar.HUE_INSTALL_DIR 19 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, copy_command)) 20 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, launch_hue_step1)) 21 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, launch_hue_step2)) 22 | print "==> Started HUE service successfully" 23 | 24 | def stop_hue(masters, opts): 25 | print "==> Stopping HUE service..." 26 | master = masters[0] 27 | ins = ecs.get_instance_info(master) 28 | ip = ins['InnerIpAddress']['IpAddress'][0] 29 | stop_hue_step1 = ' \" pgrep supervisor | xargs -r kill -9 \" ' 30 | stop_hue_step2 = ' \" ps -ef | grep livy.server.Main | grep -v grep | awk \'{print \$2}\' | xargs -r kill -9 \" ' 31 | stop_hue_step3 = ' \" pgrep hue | xargs -r kill -9 \" ' 32 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, stop_hue_step1)) 33 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, stop_hue_step2)) 34 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, stop_hue_step3)) 35 | print "==> Stopped HUE service successfully" 36 | -------------------------------------------------------------------------------- /bin/service/spark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding=utf-8 3 | import sys 4 | import os 5 | from core import ecs, utils 6 | from core.common import GlobalVar 7 | 8 | def setup_cluster(masters, slaves, opts, deploy_ssh_key): 9 | master = masters[0] 10 | if deploy_ssh_key: 11 | print "==> Generating cluster's SSH key on master..." 12 | key_setup = """ 13 | [ -f ~/.ssh/id_rsa ] || 14 | (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa && 15 | cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys) 16 | """ 17 | utils.do_ssh(master, opts, key_setup) 18 | dot_ssh_tar = utils.ssh_read(master, opts, ['tar', 'c', '.ssh']) 19 | print "==> Transferring cluster's SSH key to slaves..." 20 | for slave in slaves: 21 | utils.ssh_write(slave, opts, ['tar', 'x'], dot_ssh_tar) 22 | 23 | print "==> Updating /etc/hosts for each ECS instance..." 24 | utils.prepare_hosts(master, slaves, opts) 25 | 26 | print "==> Updating Spark default configuration..." 27 | # copy default hadoop config 28 | os.system(" /bin/cp -r %s/spark/conf/* %s" 29 | % (GlobalVar.DEFAULT_CONF_DIR, GlobalVar.SPARK_CONF_DIR)) 30 | utils.do_scp(masters[0], opts, GlobalVar.SPARK_CONF_DIR, GlobalVar.SPARK_INSTALL_DIR) 31 | for slave in slaves: 32 | utils.do_scp(slave, opts, GlobalVar.SPARK_CONF_DIR, GlobalVar.SPARK_INSTALL_DIR) 33 | 34 | print "==> Starting spark cluster..." 
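    # start_spark_cluster brings up the standalone master with sbin/start-master.sh and then
    # registers each slave as a worker against spark://<master-hostname>:7077.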
35 | start_spark_cluster(master, slaves, opts) 36 | 37 | def start_spark_cluster(master, slaves, opts): 38 | ins = ecs.get_instance_info(master) 39 | master_name = ins['HostName'] 40 | start_master = "%s/sbin/start-master.sh " % GlobalVar.SPARK_INSTALL_DIR 41 | utils.do_ssh(master, opts, str(start_master)) 42 | for slave in slaves: 43 | instance_info = ecs.get_instance_info(slave) 44 | worker_name = instance_info['HostName'] 45 | start_slave = "%s/sbin/start-slave.sh %s spark://%s:7077" \ 46 | % (GlobalVar.SPARK_INSTALL_DIR, worker_name, master_name) 47 | utils.do_ssh(slave, opts, str(start_slave)) 48 | print "==> Started spark cluster successfully!" 49 | 50 | def stop_spark_cluster(masters, slaves, opts): 51 | master = masters[0] 52 | stop_master = "%s/sbin/stop-master.sh " % GlobalVar.SPARK_INSTALL_DIR 53 | print "==> Stopping Spark Master..." 54 | utils.do_ssh(master, opts, str(stop_master)) 55 | 56 | print "==> Stopping Spark Slaves..." 57 | for slave in slaves: 58 | instance_info = ecs.get_instance_info(slave) 59 | worker_name = instance_info['HostName'] 60 | stop_slave = "%s/sbin/spark-daemon.sh stop org.apache.spark.deploy.worker.Worker %s" \ 61 | % (GlobalVar.SPARK_INSTALL_DIR, worker_name) 62 | utils.do_ssh(slave, opts, str(stop_slave)) -------------------------------------------------------------------------------- /bin/service/hdfs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding=utf-8 3 | import os 4 | import sys 5 | from core import ecs, utils 6 | from core.common import GlobalVar 7 | 8 | def setup_hdfs(masters, slaves, opts): 9 | print "==> Updating Hadoop configuration for each ECS instance..." 10 | # copy default hadoop config 11 | os.system(" /bin/cp -r %s/hadoop/etc/hadoop/* %s/etc/hadoop/" 12 | % (GlobalVar.DEFAULT_CONF_DIR, GlobalVar.HADOOP_INSTALL_DIR)) 13 | 14 | master_intranet_ip = ecs.get_instance_info(masters[0])['InnerIpAddress']['IpAddress'][0] 15 | namenode = "hdfs://%s:9000" % master_intranet_ip 16 | utils.update_hadoop_configuration(namenode) 17 | utils.do_scp(masters[0], opts, GlobalVar.HADOOP_CONF_DIR, "%s/etc/" % GlobalVar.HADOOP_INSTALL_DIR) 18 | for slave in slaves: 19 | utils.do_scp(slave, opts, GlobalVar.HADOOP_CONF_DIR, "%s/etc/" % GlobalVar.HADOOP_INSTALL_DIR) 20 | 21 | print "==> Starting HDFS service..." 22 | start_hdfs(masters[0], slaves, opts) 23 | print "==> Started HDFS service successfully" 24 | 25 | def start_hdfs(master, slaves, opts): 26 | utils.warning() 27 | msg = "If this is the first time, you need to format HDFS, otherwise you should not format it! \n" \ 28 | "Format HDFS (Y/n): " 29 | confirm = raw_input(msg) 30 | if confirm == 'Y': 31 | msg = "Confirm to format HDFS? (Y/n): " 32 | confirm_again = raw_input(msg) 33 | if confirm_again == "Y": 34 | print "==> Formatting HDFS..." 35 | format_hdfs = "%s/bin/hdfs namenode -format -force 2> /dev/null" % GlobalVar.HADOOP_INSTALL_DIR 36 | utils.do_ssh(master, opts, str(format_hdfs)) 37 | else: 38 | print "==> Not `Y`, skipping formatting HDFS..." 39 | else: 40 | print "==> Not `Y`, skipping formatting HDFS..." 41 | 42 | print "==> Starting namenode..." 43 | start_namenode = "%s/sbin/hadoop-daemon.sh --config %s --script hdfs start namenode" \ 44 | % (GlobalVar.HADOOP_INSTALL_DIR, GlobalVar.HADOOP_CONF_DIR) 45 | utils.do_ssh(master, opts, start_namenode) 46 | 47 | print "==> Starting datanode..." 
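    # Bring up a datanode on every slave with hadoop-daemon.sh, pointing each one at the
    # shared Hadoop configuration directory.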
48 | for slave in slaves: 49 | start_datanode = "%s/sbin/hadoop-daemon.sh --config %s --script hdfs start datanode" \ 50 | % (GlobalVar.HADOOP_INSTALL_DIR, GlobalVar.HADOOP_CONF_DIR) 51 | utils.do_ssh(slave, opts, start_datanode) 52 | 53 | def stop_hdfs(masters, slaves, opts): 54 | print "==> Stopping namenode..." 55 | master = masters[0] 56 | stop_namenode = "%s/sbin/hadoop-daemon.sh --config %s --script hdfs stop namenode" \ 57 | % (GlobalVar.HADOOP_INSTALL_DIR, GlobalVar.HADOOP_CONF_DIR) 58 | utils.do_ssh(master, opts, stop_namenode) 59 | 60 | print "==> Stopping datanodes..." 61 | for slave in slaves: 62 | stop_datanode = "%s/sbin/hadoop-daemon.sh --config %s --script hdfs stop datanode" \ 63 | % (GlobalVar.HADOOP_INSTALL_DIR, GlobalVar.HADOOP_CONF_DIR) 64 | utils.do_ssh(slave, opts, stop_datanode) 65 | print "==> Stopped HDFS service successfully" 66 | -------------------------------------------------------------------------------- /doc/ssh_tunnel.md: -------------------------------------------------------------------------------- 1 | # 打通SSH隧道 2 | 3 | 打通`本机 <--> Spark Master`, 以便在本机访问Spark UI, Hue, Spark Notebook. 4 | 5 | 要连接主节点的 SparkUI、HUE、Spark-notebook的UI界面,需要创建本机到Spark主节点的SSH隧道,以本地端口转发到远程端口的安全的方式访问。具体的创建步骤如下: 6 | 7 | ## SSH客户端配置 8 | 9 | 支持PuTTY(windows)或OpenSSH(linux、Max OSX) 10 | 11 | ### windows相关配置 12 | 13 | 1. 首先[下载PuTTY](http://www.chiark.greenend.org.uk/~sgtatham/putty/download.html) 14 | 2. 配置PuTTY 15 | * 首先创建一个session并配置好Master的IP地址和22端口号并保存session。这一步的目的是能连接到SSH Server建立一个SSH通道 16 | ![](http://i.imgur.com/AgmjuGL.jpg) 17 | * 切换到Tunnel面板,分别配置Source Port和 Destination的IP端口,然后点击Add保存端口转发映射 18 | ![](http://i.imgur.com/MWOj90s.jpg) 19 | 20 | 3. 点击open按钮,输入用户名密码登陆 21 | 这样就建立好了一个带有端口转发的SSH隧道。访问`http://127.0.0.1:8888`端口的请求就会被转发到远程机器的`9000`端口。通过此方式,就可以安全的访问Spark UI、spark-notebook、和HUE的页面了。 22 | 23 | ### Linux相关配置 24 | 25 | 1. 安装openssh (ECS默认都有安装) 26 | 2. 执行命令 `ssh -N -f -L port1:127.0.0.1:port2 username@ip` 27 | 28 | *参数说明* 29 | 30 | 参数 | 描述 31 | ------------ | ------------- 32 | -N | 参数告诉SSH客户端,改命令仅仅做端口转发 33 | -f|告诉SSH客户端在后台运行 34 | -L|做本地映射端口 35 | port1|要使用的本地端口 36 | port2|要映射的远程端口 37 | username|登陆远程机器的用户名 38 | ip|要建立通道的远程机器的IP 39 | 40 | 41 | > 连接成功后,在浏览器访问 127.0.0.1:port1 就可以被转发到服务器的 ip:port2端口了 42 | > 因为直接访问服务的器的目标端口是被防火墙屏蔽的,所以SSH隧道技术,可以绕过防火墙的设置,并提供了一个安全访问的机制。 43 | 44 | ## 使用SparkUI、spark-notebook、Hue 45 | 46 | 请确保上文中SSH隧道能够打通 47 | 48 | ### Web服务的端口映射绑定 49 | 50 | SparkUI的配置: 51 | 52 | #### Linux命令行执行如下命令 53 | 1. 将本地`8081`绑定到远程`80`端口 54 | `ssh -N -f -L 8081:127.0.0.1:80 username@ip` 55 | > username和ip分别为登陆master机器的username和IP 56 | 2. 将本地`80`绑定到远程`80`端口 57 | `ssh -N -f -L 80:127.0.0.1:80 username@ip` 58 | > username和ip分别为登陆master机器的username和IP 59 | 3. 将本地`8080`绑定到远程`80`端口 60 | `ssh -N -f -L 8080:127.0.0.1:80 username@ip` 61 | > username和ip分别为登陆master机器的username和IP 62 | 4. 将本地`4040`绑定到远程`4040`端口 63 | `ssh -N -f -L 4040:127.0.0.1:4040 username@ip` 64 | > username和ip分别为登陆master机器的username和IP 65 | 66 | #### windows下 Putty的配置 67 | 1. 将本地 8081 绑定到远程 80 端口 68 | 结合上图切换到 Tunnel对应的选项卡: 69 | * source port填写 8081 70 | * Destination 填写 127.0.0.1:80 71 | 2. 将本地 80 绑定到远程 80 端口 72 | 结合上图切换到 Tunnel对应的选项卡: 73 | * source port填写 80 74 | * Destination 填写 127.0.0.1:80 75 | 3. 将本地 8080 绑定到远程 80 端口 76 | 结合上图切换到 Tunnel对应的选项卡: 77 | * source port填写 8080 78 | * Destination 填写 127.0.0.1:80 79 | 4. 
将本地 4040 绑定到远程 4040 端口 80 | 结合上图切换到 Tunnel对应的选项卡: 81 | * source port填写 4040 82 | * Destination 填写 127.0.0.1:4040 83 | 84 | #### 将Spark master和所有slave的机器名绑定127.0.0.1 85 | 86 | 如: `127.0.0.1 23 hxs787e` 87 | 88 | * windows hosts文件路径: `C:\Windows\System32\drivers\etc\hosts` 89 | * linux hosts文件路径: `/etc/hosts` 90 | 91 | #### Spark-notebook的配置: 92 | 93 | 1. Linux命令行执行如下命令: 94 | `ssh -N -f -L port1:127.0.0.1:9090 username@ip` 95 | *username和ip分别为登陆master机器的username和IP,port1的值为与本机其他端口不冲突的任意有效值* 96 | 2. windows Putty的配置: 97 | 结合上图切换到 Tunnel对应的选项卡: 98 | * source port填写与本机其他端口不冲突的任意有效值 99 | * Destination 填写 127.0.0.1:9090 100 | 101 | #### 配置Hue 102 | 103 | 1. Linux命令行执行如下命令: 104 | `ssh -N -f -L port1:127.0.0.1:8888 username@ip` 105 | *username和ip分别为登陆master机器的username和IP,port1的值为与本机其他端口不冲突的任意有效值* 106 | 107 | 2. windows Putty的配置 108 | 结合上图切换到 Tunnel对应的选项卡: 109 | * source port填写与本机其他端口不冲突的任意有效值 110 | * Destination 填写 127.0.0.1:8888 -------------------------------------------------------------------------------- /bin/core/config_nginx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding=utf-8 3 | import os 4 | import sys 5 | 6 | def do_generate_upstream_server_config(spark_host_info_path): 7 | 8 | format_tab = "\t" 9 | format_tab2 = format_tab*2 10 | up_stream_place_template = format_tab+"upstream server_${hostname} {"+os.linesep + \ 11 | format_tab2 + "server ${host}:${port};" + os.linesep + \ 12 | format_tab + "}" 13 | server_place_holder_template = format_tab+"server {" + os.linesep + \ 14 | format_tab2 + "listen 80;" + os.linesep + \ 15 | format_tab2 + "server_name ${hostname};"+os.linesep + \ 16 | format_tab2 + "location / {" + os.linesep + \ 17 | format_tab2 + " proxy_pass http://server_${hostname};" + os.linesep + \ 18 | format_tab2 + "}"+os.linesep + \ 19 | format_tab + "}" 20 | spark_host_info_file = open(spark_host_info_path) 21 | host_info_lines = spark_host_info_file.readlines()[1:] 22 | 23 | up_stream_str = "" 24 | server_stream_str = "" 25 | 26 | spark_master_host_name = "spark_master" 27 | up_stream_master_item = up_stream_place_template.replace("${hostname}", spark_master_host_name)\ 28 | .replace("${host}", "127.0.0.1")\ 29 | .replace("${port}", "8080").replace("\t", "", 1) 30 | server_stream_master_item = \ 31 | server_place_holder_template.replace("${hostname}", spark_master_host_name).replace("\t", "", 1) 32 | 33 | up_stream_str += up_stream_master_item + os.linesep 34 | server_stream_str += server_stream_master_item+os.linesep 35 | 36 | for host_info in host_info_lines: 37 | host_info_list = host_info.split() 38 | up_stream_item = up_stream_place_template.replace("${hostname}", host_info_list[1].strip()) \ 39 | .replace("${host}", host_info_list[0].strip()) \ 40 | .replace("${port}", "8081") 41 | server_stream_item = server_place_holder_template.replace("${hostname}", host_info_list[1].strip()) 42 | 43 | up_stream_str += up_stream_item.rstrip() + os.linesep 44 | server_stream_str += server_stream_item.rstrip()+os.linesep 45 | return up_stream_str, server_stream_str 46 | 47 | def do_update_nginx_config_file(result_content, nginx_config_target_path): 48 | nginx_config_file = file(nginx_config_target_path, "w") 49 | nginx_config_file.write(result_content) 50 | 51 | def generate_config_file(spark_host_info_path,nginx_config_template_path, nginx_config_taget_path): 52 | 53 | up_stream_place_holder="${upstream_place_holder}" 54 | server_place_holder="${server_place_holder}" 55 | 56 | nginx_upstream_server_tuple = 
do_generate_upstream_server_config(spark_host_info_path) 57 | 58 | nginx_config_template_file = open(nginx_config_template_path) 59 | nginx_config_template_lines = nginx_config_template_file.readlines() 60 | result_content = "" 61 | for line in nginx_config_template_lines: 62 | result_content += line 63 | 64 | result_content = result_content.replace(up_stream_place_holder, nginx_upstream_server_tuple[0]) \ 65 | .replace(server_place_holder, nginx_upstream_server_tuple[1]) 66 | 67 | do_update_nginx_config_file(result_content, nginx_config_taget_path) 68 | 69 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The Artistic License 2.0 2 | 3 | Copyright (c) 2015 aliyun 4 | 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | This license establishes the terms under which a given free software 11 | Package may be copied, modified, distributed, and/or redistributed. 12 | The intent is that the Copyright Holder maintains some artistic 13 | control over the development of that Package while still keeping the 14 | Package available as open source and free software. 15 | 16 | You are always permitted to make arrangements wholly outside of this 17 | license directly with the Copyright Holder of a given Package. If the 18 | terms of this license do not permit the full use that you propose to 19 | make of the Package, you should contact the Copyright Holder and seek 20 | a different licensing arrangement. 21 | 22 | Definitions 23 | 24 | "Copyright Holder" means the individual(s) or organization(s) 25 | named in the copyright notice for the entire Package. 26 | 27 | "Contributor" means any party that has contributed code or other 28 | material to the Package, in accordance with the Copyright Holder's 29 | procedures. 30 | 31 | "You" and "your" means any person who would like to copy, 32 | distribute, or modify the Package. 33 | 34 | "Package" means the collection of files distributed by the 35 | Copyright Holder, and derivatives of that collection and/or of 36 | those files. A given Package may consist of either the Standard 37 | Version, or a Modified Version. 38 | 39 | "Distribute" means providing a copy of the Package or making it 40 | accessible to anyone else, or in the case of a company or 41 | organization, to others outside of your company or organization. 42 | 43 | "Distributor Fee" means any fee that you charge for Distributing 44 | this Package or providing support for this Package to another 45 | party. It does not mean licensing fees. 46 | 47 | "Standard Version" refers to the Package if it has not been 48 | modified, or has been modified only in ways explicitly requested 49 | by the Copyright Holder. 50 | 51 | "Modified Version" means the Package, if it has been changed, and 52 | such changes were not explicitly requested by the Copyright 53 | Holder. 54 | 55 | "Original License" means this Artistic License as Distributed with 56 | the Standard Version of the Package, in its current version or as 57 | it may be modified by The Perl Foundation in the future. 58 | 59 | "Source" form means the source code, documentation source, and 60 | configuration files for the Package. 61 | 62 | "Compiled" form means the compiled bytecode, object code, binary, 63 | or any other form resulting from mechanical transformation or 64 | translation of the Source form. 
65 | 66 | 67 | Permission for Use and Modification Without Distribution 68 | 69 | (1) You are permitted to use the Standard Version and create and use 70 | Modified Versions for any purpose without restriction, provided that 71 | you do not Distribute the Modified Version. 72 | 73 | 74 | Permissions for Redistribution of the Standard Version 75 | 76 | (2) You may Distribute verbatim copies of the Source form of the 77 | Standard Version of this Package in any medium without restriction, 78 | either gratis or for a Distributor Fee, provided that you duplicate 79 | all of the original copyright notices and associated disclaimers. At 80 | your discretion, such verbatim copies may or may not include a 81 | Compiled form of the Package. 82 | 83 | (3) You may apply any bug fixes, portability changes, and other 84 | modifications made available from the Copyright Holder. The resulting 85 | Package will still be considered the Standard Version, and as such 86 | will be subject to the Original License. 87 | 88 | 89 | Distribution of Modified Versions of the Package as Source 90 | 91 | (4) You may Distribute your Modified Version as Source (either gratis 92 | or for a Distributor Fee, and with or without a Compiled form of the 93 | Modified Version) provided that you clearly document how it differs 94 | from the Standard Version, including, but not limited to, documenting 95 | any non-standard features, executables, or modules, and provided that 96 | you do at least ONE of the following: 97 | 98 | (a) make the Modified Version available to the Copyright Holder 99 | of the Standard Version, under the Original License, so that the 100 | Copyright Holder may include your modifications in the Standard 101 | Version. 102 | 103 | (b) ensure that installation of your Modified Version does not 104 | prevent the user installing or running the Standard Version. In 105 | addition, the Modified Version must bear a name that is different 106 | from the name of the Standard Version. 107 | 108 | (c) allow anyone who receives a copy of the Modified Version to 109 | make the Source form of the Modified Version available to others 110 | under 111 | 112 | (i) the Original License or 113 | 114 | (ii) a license that permits the licensee to freely copy, 115 | modify and redistribute the Modified Version using the same 116 | licensing terms that apply to the copy that the licensee 117 | received, and requires that the Source form of the Modified 118 | Version, and of any works derived from it, be made freely 119 | available in that license fees are prohibited but Distributor 120 | Fees are allowed. 121 | 122 | 123 | Distribution of Compiled Forms of the Standard Version 124 | or Modified Versions without the Source 125 | 126 | (5) You may Distribute Compiled forms of the Standard Version without 127 | the Source, provided that you include complete instructions on how to 128 | get the Source of the Standard Version. Such instructions must be 129 | valid at the time of your distribution. If these instructions, at any 130 | time while you are carrying out such distribution, become invalid, you 131 | must provide new instructions on demand or cease further distribution. 132 | If you provide valid instructions or cease distribution within thirty 133 | days after you become aware that the instructions are invalid, then 134 | you do not forfeit any of your rights under this license. 
135 | 136 | (6) You may Distribute a Modified Version in Compiled form without 137 | the Source, provided that you comply with Section 4 with respect to 138 | the Source of the Modified Version. 139 | 140 | 141 | Aggregating or Linking the Package 142 | 143 | (7) You may aggregate the Package (either the Standard Version or 144 | Modified Version) with other packages and Distribute the resulting 145 | aggregation provided that you do not charge a licensing fee for the 146 | Package. Distributor Fees are permitted, and licensing fees for other 147 | components in the aggregation are permitted. The terms of this license 148 | apply to the use and Distribution of the Standard or Modified Versions 149 | as included in the aggregation. 150 | 151 | (8) You are permitted to link Modified and Standard Versions with 152 | other works, to embed the Package in a larger work of your own, or to 153 | build stand-alone binary or bytecode versions of applications that 154 | include the Package, and Distribute the result without restriction, 155 | provided the result does not expose a direct interface to the Package. 156 | 157 | 158 | Items That are Not Considered Part of a Modified Version 159 | 160 | (9) Works (including, but not limited to, modules and scripts) that 161 | merely extend or make use of the Package, do not, by themselves, cause 162 | the Package to be a Modified Version. In addition, such works are not 163 | considered parts of the Package itself, and are not subject to the 164 | terms of this license. 165 | 166 | 167 | General Provisions 168 | 169 | (10) Any use, modification, and distribution of the Standard or 170 | Modified Versions is governed by this Artistic License. By using, 171 | modifying or distributing the Package, you accept this license. Do not 172 | use, modify, or distribute the Package, if you do not accept this 173 | license. 174 | 175 | (11) If your Modified Version has been derived from a Modified 176 | Version made by someone other than you, you are nevertheless required 177 | to ensure that your Modified Version complies with the requirements of 178 | this license. 179 | 180 | (12) This license does not grant you the right to use any trademark, 181 | service mark, tradename, or logo of the Copyright Holder. 182 | 183 | (13) This license includes the non-exclusive, worldwide, 184 | free-of-charge patent license to make, have made, use, offer to sell, 185 | sell, import and otherwise transfer the Package with respect to any 186 | patent claims licensable by the Copyright Holder that are necessarily 187 | infringed by the Package. If you institute patent litigation 188 | (including a cross-claim or counterclaim) against any party alleging 189 | that the Package constitutes direct or contributory patent 190 | infringement, then this Artistic License to you shall terminate on the 191 | date that such litigation is filed. 192 | 193 | (14) Disclaimer of Warranty: 194 | THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS 195 | IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED 196 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR 197 | NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL 198 | LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL 199 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL 200 | DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF 201 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
202 | 
203 | 
--------------------------------------------------------------------------------
/bin/core/ecs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #coding=utf-8
3 | import sys
4 | import os
5 | import tarfile
6 | import time
7 | import commands
8 | import urllib2
9 | import utils
10 | from sys import stderr
11 | from datetime import datetime
12 | from common import GlobalVar
13 | 
14 | def setup_aliyun_sdk():
15 |     lib_dir = os.path.join(GlobalVar.SPARK_ECS_DIR, "lib")
16 |     if not os.path.exists(lib_dir):
17 |         os.mkdir(lib_dir)
18 |     ecs_sdk_lib_dir = os.path.join(lib_dir, "aliyun-sdk")
19 |     if not os.path.isdir(ecs_sdk_lib_dir):
20 |         tgz_file_path = os.path.join(lib_dir, "aliyun-sdk.tgz")
21 |         print "Downloading Aliyun sdk..."
22 |         download_stream = urllib2.urlopen(GlobalVar.ALIYUN_SDK_URL)
23 |         with open(tgz_file_path, "wb") as tgz_file:
24 |             tgz_file.write(download_stream.read())
25 |         tar = tarfile.open(tgz_file_path)
26 |         tar.extractall(path=lib_dir)
27 |         tar.close()
28 |         os.remove(tgz_file_path)
29 |         os.system("mv %s/* %s/aliyun-sdk" % (lib_dir, lib_dir))
30 |         print "Finished downloading Aliyun sdk"
31 |     sys.path.insert(0, ecs_sdk_lib_dir)
32 | 
33 | setup_aliyun_sdk()
34 | import aliyun.api
35 | 
36 | def set_secret_key():
37 |     access_id = os.getenv('ALIYUN_ACCESS_ID')
38 |     if access_id is None:
39 |         print >> stderr, ("ERROR: The environment variable ALIYUN_ACCESS_ID must be set")
40 |         sys.exit(1)
41 |     access_key = os.getenv('ALIYUN_ACCESS_KEY')
42 |     if access_key is None:
43 |         print >> stderr, ("ERROR: The environment variable ALIYUN_ACCESS_KEY must be set")
        sys.exit(1)
44 |     aliyun.setDefaultAppInfo(access_id, access_key)
45 | 
46 | set_secret_key()
47 | 
48 | def check_aliyun_api_ret_code(response):
49 |     if "Code" in response:
50 |         print "Fail."
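        # An error response from the Aliyun API carries "Code" and "Message" fields;
        # surface them and raise so the calling operation is aborted.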
51 | print response['Code'] 52 | print response['Message'] 53 | raise RuntimeError(response['Code'], response['Message']) 54 | 55 | def authorize_security_group_in(group_id, ip_protocol, src_group_id, src_cidr_ip, port_range, opts): 56 | req = aliyun.api.Ecs20140526AuthorizeSecurityGroupRequest() 57 | req.RegionId = opts.region 58 | req.SecurityGroupId = group_id 59 | req.IpProtocol = ip_protocol 60 | req.PortRange = port_range 61 | if src_cidr_ip == "": 62 | req.SourceGroupId = src_group_id 63 | req.NicType = "intranet" 64 | else: 65 | req.SourceCidrIp = src_cidr_ip 66 | req.NicType = "internet" 67 | f = req.getResponse() 68 | check_aliyun_api_ret_code(f) 69 | 70 | def authorize_security_group_out(group_id, ip_protocol, dst_group_id, dst_cidr_ip, port_range, opts): 71 | req = aliyun.api.Ecs20140526AuthorizeSecurityGroupEgressRequest() 72 | req.RegionId = opts.region 73 | req.SecurityGroupId = group_id 74 | req.IpProtocol = ip_protocol 75 | req.PortRange = port_range 76 | if dst_cidr_ip == "": 77 | req.DestGroupId = dst_group_id 78 | req.NicType = "intranet" 79 | else: 80 | req.DestCidrIp = dst_cidr_ip 81 | req.NicType = "internet" 82 | f = req.getResponse() 83 | check_aliyun_api_ret_code(f) 84 | 85 | def get_security_group_rules(security_group_id, opts): 86 | req = aliyun.api.Ecs20140526DescribeSecurityGroupAttributeRequest() 87 | req.SecurityGroupId = security_group_id 88 | req.RegionId = opts.region 89 | f = req.getResponse() 90 | check_aliyun_api_ret_code(f) 91 | permissions = f['Permissions']['Permission'] 92 | return permissions 93 | 94 | def get_all_instances(opts): 95 | page_number = 1 96 | instances = [] 97 | req = aliyun.api.Ecs20140526DescribeInstancesRequest() 98 | req.RegionId = opts.region 99 | req.PageSize = GlobalVar.ECS_API_PAGESIZE 100 | req.PageNumber = page_number 101 | f = req.getResponse() 102 | check_aliyun_api_ret_code(f) 103 | instances += f['Instances']['Instance'] 104 | total_pages = f['TotalCount'] / (GlobalVar.ECS_API_PAGESIZE + 1) + 1 105 | while page_number < total_pages: 106 | page_number += 1 107 | req = aliyun.api.Ecs20140526DescribeInstancesRequest() 108 | req.RegionId = opts.region 109 | req.PageSize = GlobalVar.ECS_API_PAGESIZE 110 | req.PageNumber = page_number 111 | f = req.getResponse() 112 | check_aliyun_api_ret_code(f) 113 | instances += f['Instances']['Instance'] 114 | 115 | return instances 116 | 117 | def get_gateway_instance_info(opts): 118 | ip = commands.getoutput("""ifconfig eth0 | awk 'NR==2 {print $2}' | awk -F'[:]' '{print $2}'""") 119 | all_instances = get_all_instances(opts) 120 | for ins in all_instances: 121 | inner_ips = ins['InnerIpAddress']['IpAddress'] 122 | public_ips = ins['PublicIpAddress']['IpAddress'] 123 | if ip in inner_ips + public_ips: 124 | return ins 125 | raise RuntimeError('Could find instance information of the current gateway.') 126 | 127 | def clear_security_group_rules(group_id, opts): 128 | security_group_rules = get_security_group_rules(group_id, opts) 129 | for rule in security_group_rules: 130 | if rule['SourceGroupId'] != "" or rule['SourceCidrIp'] != "": 131 | req = aliyun.api.Ecs20140526RevokeSecurityGroupRequest() 132 | req.SourceGroupId = rule['SourceGroupId'] 133 | req.SourceCidrIp = rule['SourceCidrIp'] 134 | else: 135 | req = aliyun.api.Ecs20140526RevokeSecurityGroupEgressRequest() 136 | req.DestGroupId = rule['DestGroupId'] 137 | req.DestCidrIp = rule['DestCidrIp'] 138 | req.SecurityGroupId = group_id 139 | req.RegionId = opts.region 140 | req.IpProtocol = rule['IpProtocol'] 141 | req.PortRange = 
rule['PortRange'] 142 | f = req.getResponse() 143 | check_aliyun_api_ret_code(f) 144 | 145 | def launch_instance(opts, cluster_name, role, ami, instance_type, security_group_id, instance_name, 146 | internet_band_out, host_name, pass_word, open_public_ip=False): 147 | req = aliyun.api.Ecs20140526CreateInstanceRequest() 148 | req.RegionId = opts.region 149 | req.ImageId = ami 150 | req.InstanceType = instance_type 151 | req.SecurityGroupId = security_group_id 152 | req.InstanceName = instance_name 153 | req.HostName = host_name.replace('_', '-') 154 | req.Password = pass_word 155 | if role == "masters" or open_public_ip: 156 | req.InternetChargeType = "PayByTraffic" 157 | req.InternetMaxBandwidthOut = internet_band_out 158 | else: 159 | req.InternetChargeType = "PayByBandwidth" 160 | req.InternetMaxBandwidthOut = "0" 161 | if opts.disk_size is not None: 162 | req.DataDisk_1_Category = "cloud" 163 | req.DataDisk_1_Device = "/dev/xvdb" 164 | req.DataDisk_1_Size = opts.disk_size 165 | 166 | req2 = aliyun.api.Ecs20140526StartInstanceRequest() 167 | req3 = aliyun.api.Ecs20140526AllocatePublicIpAddressRequest() 168 | 169 | f = req.getResponse() 170 | check_aliyun_api_ret_code(f) 171 | instance_id = f['InstanceId'] 172 | utils.save_masters_or_slaves(cluster_name, role, instance_id) 173 | if open_public_ip: 174 | req3.InstanceId = instance_id 175 | f = req3.getResponse() 176 | check_aliyun_api_ret_code(f) 177 | req2.InstanceId = instance_id 178 | f = req2.getResponse() 179 | check_aliyun_api_ret_code(f) 180 | return instance_id 181 | 182 | def release_ecs_instance(instance_ids): 183 | print("Terminating masters and slaves...") 184 | start_time = datetime.now() 185 | 186 | print "==> Checking cluster status. We can do noting before the cluster enter `Running` status..." 187 | utils.wait_for_cluster_state(['Running', 'Stopping', 'Stopped'], instance_ids) 188 | print "==> Checked OK..." 189 | 190 | need_to_stop = [] 191 | need_to_release = [] 192 | for ins in instance_ids: 193 | instance_info = get_instance_info(ins) 194 | status = instance_info['Status'] 195 | if status in ['Running']: 196 | need_to_stop.append(ins) 197 | need_to_release.append(ins) 198 | elif status in ['Stopped', 'Stopping']: 199 | need_to_release.append(ins) 200 | 201 | for ins in need_to_stop: 202 | try: 203 | req = aliyun.api.Ecs20140526StopInstanceRequest() 204 | req.InstanceId = ins 205 | f = req.getResponse() 206 | check_aliyun_api_ret_code(f) 207 | except Exception, e: 208 | print e 209 | raise e 210 | 211 | retries = 0 212 | while True: 213 | time.sleep(5) 214 | all_released = True 215 | 216 | for ins in need_to_release: 217 | try: 218 | instance_info = get_instance_info(ins) 219 | if instance_info['Status'] == "Stopped": 220 | req2 = aliyun.api.Ecs20140526DeleteInstanceRequest() 221 | req2.InstanceId = ins 222 | f = req2.getResponse() 223 | check_aliyun_api_ret_code(f) 224 | elif instance_info['Status'] in ["Running", "Stopping"]: 225 | all_released = False 226 | except Exception, e: 227 | print >> stderr, "Error releasing ECS instance, retrying later." 228 | time.sleep(5) 229 | if retries >= 10: 230 | raise e 231 | 232 | if all_released: 233 | break 234 | 235 | retries += 1 236 | 237 | end_time = datetime.now() 238 | print "Cluster instances have been released successfully. 
Waited {t} seconds.".format( 239 | t=(end_time - start_time).seconds 240 | ) 241 | 242 | def get_all_instances_status(instances): 243 | all_instances_status = [] 244 | for ins in instances: 245 | instance_status = get_instance_info(ins)['Status'] 246 | all_instances_status.append(instance_status) 247 | return all_instances_status 248 | 249 | def get_instance_info(instance_id): 250 | req = aliyun.api.Ecs20140526DescribeInstanceAttributeRequest() 251 | req.InstanceId = instance_id 252 | f = req.getResponse() 253 | check_aliyun_api_ret_code(f) 254 | return f 255 | 256 | -------------------------------------------------------------------------------- /doc/manual.md: -------------------------------------------------------------------------------- 1 | # Spark On ECS 2 | v0.2 3 | 2015.6.30 4 | 5 | 6 | ## Prepare 7 | ------------- 8 | ### 三种工作模式 9 | 脚本工作在三种不同的模式下,下面会介绍三种不同的模式: 10 | * cluster + gateway exclude模式 11 | 需要先申请一台具有公网访问能力的ECS机器作为gateway,然后脚本会自动创建一个新的master和多台slaves,最终这个gateway机器不会成为集群的一部分。 12 | * cluster + gateway include模式 13 | 需要先申请一台具有公网访问能力的ECS机器作为gateway。这台机器会作为集群的master存在,脚本会创建其余的slaves。 14 | * client 模式 15 | 用户可以自行在ECS的购买页面上先行购买好所有的机器,(但是需要使用我们的Spark环境的镜像,此外机器的密码目前需要都一样)。然后在其中一台具有公网访问能力的机器上配置机器信息的配置文件,脚本会读取配置并负责环境的启动。 16 | 17 | 18 | ## Quick Start 19 | 20 | #### cluster + gateway exclude模式 21 | ------------- 22 | 23 | ### 1. 选购Gateway 24 | 在[阿里云ECS](http://www.aliyun.com/product/ecs/)购买一台ECS实例作为Gateway,用来执行自动化部署脚本。 25 | 26 | * Gateway需要配置公网IP,默认不作为Spark集群的一部分,Gateway可以用低配 27 | * Gateway所在地域默认为spark cluster的地域(Region) 28 | 29 | ### 2. 配置环境变量 30 | 31 | 从[AccessKey管理](https://ak-console.aliyun.com/#/accesskey)获得阿里云API公钥密钥。在gateway上配置环境变量: 32 | *ALIYUN_ACCESS_ID*和*ALIYUN_ACCESS_KEY* 33 | 34 | ``` 35 | export ALIYUN_ACCESS_ID=HAxxxxxxxxxx2 36 | export ALIYUN_ACCESS_KEY=JAxxxxxxxxxxxxxxxxxxxxxxxxxs 37 | ``` 38 | *考虑到安全性, 推荐每次登陆时在当前会话中设置环境变量; 出于方便也可以在.bash_profile中配置(不推荐)* 39 | 40 | ### 3. 执行脚本, 启动spark集群 41 | 42 | - 在geteway上执行: **`python spark_ecs.py --mode=cluster -t ecs.s2.large launch spark-test`** 43 | 44 | - 购买前会有一个Check List,列出您购买的ECS实例配置和个数,如下: 45 | 46 | ``` 47 | +--------------------------------------------------------+ 48 | + Check List + 49 | +--------------------------------------------------------+ 50 | 51 | Running Mode: cluster 52 | 53 | Master Instance: 54 | Number: 1 55 | Region: cn-hangzhou 56 | Zone: cn-hangzhou-d 57 | Cores: 2 58 | Memory: 4G 59 | InstanceType: ecs.s2.large 60 | InternetChargeType: PayByTraffic 61 | InternetMaxBandwidthOut: 2 62 | 63 | 64 | Slave Instance: 65 | Number: 1 66 | Region: cn-hangzhou 67 | Zone: cn-hangzhou-d 68 | Cores: 2 69 | Memory: 4G 70 | InstanceType: ecs.s2.large 71 | InternetChargeType: PayByBandwidth 72 | InternetMaxBandwidthOut: 0 73 | +--------------------------------------------------------+ 74 | 75 | ``` 76 | 这里会看到所有的生成的实例的信息,比如 77 | 78 | * Number 对应节点的数量 79 | * Region 表示所在的region 80 | * Zone 所在的zone 81 | * Image 使用的镜像的id 82 | * Cores 机器的核数的配置,目前所有的master和slaves的配置都是一样的 83 | * Memory 使用的内容,目前所有的master和slaves的配置都是一样的 84 | * InstanceType 这个是ECS的官方机型缩略代号 85 | * SecurityGroup 机器所在的安全组,一般同一个集群的会在同一个安全组内 86 | * InternetChargeType 公网流量的付费方式,按量和按带宽 87 | * InternetMaxBandwidthOut 带宽大小 88 | 89 | 90 | 启动完,会打印出Spark集群所有服务的简要信息,如下: 91 | 92 | ``` 93 | +--------------------------------------------------------+ 94 | + Spark Cluster Started Successfully! 
+ 95 | +--------------------------------------------------------+ 96 | The Spark Cluster Configuration listed as following: 97 | 98 | Spark Cluster: 99 | 100 | Spark UI: http://xxx.xxx.xxx.xxx:8080 101 | Master URL: spark://spark-test-master:7077 102 | 103 | +--------------------------------------------------------+ 104 | ``` 105 | - 到这里Spark Cluster就完全起来了, 下面可以愉快的跑spark任务了。 106 | 107 | ### 4. Spark Sample Test 108 | - 登陆到spark master: *ssh xxx.xxx.xxx.xxx*, master ip可以根据上面的成功启动的信息里面找到 109 | - 执行: `/opt/spark/bin/run-example SparkPi`, 测试spark任务能否跑成功。 110 | 111 | ### 5. 停止Spark Cluster和释放ECS 112 | 登陆到gateway上: 113 | 114 | * 停止spark cluster: `spark_ecs.py --mode=cluster stop spark-test` 115 | * 启动spark cluster: `spark_ecs.py --mode=cluster start spark-test` 116 | * 释放ECS资源: `spark_ecs.py --mode=cluster destroy spark-test` 117 | 118 | ## cluster + gateway include模式 119 | 基本上cluster gateway exclude模式一样,以下的几部需要注意 120 | ### 3. 执行脚本, 启动spark集群 121 | 122 | - 执行: **`python spark_ecs.py --mode=cluster --include-gateway -t ecs.s2.large launch spark-test`** 123 | - 需要注意的是新申请的slaves的密码需要和已有的master一致 124 | 125 | ### 4. Spark Sample Test 126 | - 由于本机就是master,所以可以直接执行: `/opt/spark/bin/run-example SparkPi`, 测试spark任务能否跑成功。 127 | 128 | ### 5. 停止Spark Cluster和释放ECS 129 | 由于本机就是master,直接在master机器上执行 130 | * 停止spark cluster: `spark_ecs.py --mode=cluster stop spark-test` 131 | * 启动spark cluster: `spark_ecs.py --mode=cluster start spark-test` 132 | * 释放ECS资源: `spark_ecs.py --mode=cluster destroy spark-test` 133 | 134 | ## client模式 135 | ### 1. 选购集群机器 136 | 不再需要选购gateway,取而代之的是,需要在ECS购买页面上购买好所有的机器,包括master和slaves 137 | 138 | ### 2. 配置环境变量 139 | 从[AccessKey管理](https://ak-console.aliyun.com/#/accesskey)获得阿里云API公钥密钥。在master上配置环境变量: *ALIYUN_ACCESS_ID*和*ALIYUN_ACCESS_KEY* 140 | 141 | ``` 142 | export ALIYUN_ACCESS_ID=HAxxxxxxxxxx2 143 | export ALIYUN_ACCESS_KEY=JAxxxxxxxxxxxxxxxxxxxxxxxxxs 144 | ``` 145 | *考虑到安全性, 推荐每次登陆时在当前会话中设置环境变量; 出于方便也可以在.bash_profile中配置(不推荐)* 146 | 147 | ### 3. 脚本下载 148 | 149 | 下载地址: [此处](), 将脚本拷贝到master任意目录下,例如`$HOME/spark` 150 | 并在脚本目录下创建master和slaves文件 151 | master内将要作为master机器的instance id写进去 152 | 一行一个id,类似 153 | ``` 154 | i-m32135678d 155 | ``` 156 | slaves内将要作为slaves机器的instance id(instance id 可以在ECS的实例列表上看到。)写进去 157 | 一行一个id,类似 158 | ``` 159 | i-m12563538d 160 | i-m12332678d 161 | i-m46745678d 162 | ``` 163 | 164 | ### 4. 执行脚本, 启动spark集群 165 | 166 | - 在master上执行: **`python spark_ecs.py --mode=client launch spark-test`** 167 | 168 | ### 6. 
停止Spark Cluster和释放ECS 169 | 在master上: 170 | 171 | * 停止spark cluster: `spark_ecs.py --mode=client stop spark-test` 172 | * 启动spark cluster: `spark_ecs.py --mode=client start spark-test` 173 | * 释放ECS资源: `spark_ecs.py --mode=client destroy spark-test` 174 | 175 | **脚本的更多参数设置见下面的用户手册。** 176 | 177 | ## Manual 178 | ------------- 179 | 180 | **Usage: `spark-ecs [options] [:]`** 181 | **``可以是: launch, destroy, stop, start, enable, disable** 182 | **``可以是: hdfs, hue, spark-notebook** 183 | 184 | 启动Spark集群格式: `python spark-ecs.py -t -i -s -p launch ` 185 | 186 | 例如: `python spark-ecs.py -t ecs.s2.large -i m-xxxxxxx5j -s 2 -p xxxxxx launch test` 187 | 188 | 启动单独服务格式:`python spark-ecs.py enable :` 189 | 190 | 例如: `python spark-ecs.py enable test:hdfs` 191 | 192 | ### 命令描述 193 | 194 | 命令 | 参数 | 描述 195 | ----|---- | ---- 196 | launch|集群名字|创建并启动一个Spark集群 197 | destroy|集群名字|销毁Spark集群,并释放集群中所有ECS实例, **集群数据将无法恢复,请及时转移重要数据**。销毁后集群ECS实例将停止收取相关费用 198 | stop|集群名字|停止Saprk集群,集群实例不会被释放,集群中数据不会丢失。**集群ECS实例将继续收取相关费用** 199 | start|集群名字|再次启动Spark集群 200 | enable|子服务名|启用一个子服务,例如hdfs,hue或者spark-notebook 201 | disable|子服务名|关闭一个子服务,例如hdfs,hue或者spark-notebook 202 | 203 | 脚本执行完会打印出: 204 | 205 | * Spark UI地址: `http://:8080` 206 | * Spark Master: `spark://:7077` 207 | * Spark Notebook(可选): `http://:9090` 208 | * Hue(可选): `http://:8888` 209 | 210 | 访问Spark UI检查所有的slave节点是否正常启动。Spark UI的使用方式见下面的说明。 211 | 212 | ### 脚本参数说明 213 | 214 | 运行`python spark-ecs.py --help`查看使用帮助。以下列出主要的配置项说明: 215 | 216 | | 参数 |缩写| 要求 | 默认值 | 描述 | 可用模式 | 217 | | ------------ | --- | ------------- | ------------ | ------------ | ----- | 218 | |`--instance-type=`|-t|可选|无|配置所要创建的ECS实例类型. 更多类型见: [实例资源规格对照表](http://docs.aliyun.com/?spm=5176.730001.3.16.5mmF39#/pub/ecs/open-api/appendix&instancetype)|clueter模式有效| 219 | | `--mode=`|-m|可选|cluster|运行模式。可选有client模式和cluster模式。client模式是使用已有ECS实例;cluster模式是创建新的ECS实例 | - | 220 | | `--pwd=` |-p|可选|无|配置Spark集群中每个ECS实例的默认密码|clueter模式有效| 221 | | `--ami=`|无|可选|无|配置阿里云ECS机器镜像ID|clueter模式有效| 222 | |`--slaves=`|-s|可选|1|配置Spark集群中Slave节点数|clueter模式有效| 223 | |`--ibo=`|无|可选|2MB|配置实例的流出的带宽上限,计费以发生的公共网络流量为依据|clueter模式有效| 224 | |`--region=`|-r|必选|无|配置ECS实例所属的Region. 注意:**Spark集群的ECS实例Region需要和login机器Region保持一致**|clueter模式有效| 225 | |`--zone=`|-z|可选|*cn-hangzhou-d*|配置ECS实例所属可用区|clueter模式有效| 226 | |`--include-gateway`|无 |可选|不包含|是否将当前登录机器包含进Spark集群|clueter模式有效| 227 | |`--enable-slave-public-ip`|无|可选|不配置|是否配置Spark Slave节点的公网IP|clueter模式有效| 228 | |`--enable-hdfs`|无|可选|不开启|是否打开HDFS服务|两种模式有效| 229 | |`--enable-hue`|无|可选|不开启|是否打开HUE服务|两种模式有效| 230 | |`--enable-spark-notebook`|无|可选|不开启|是否打开Spark Notebook服务|两种模式有效| 231 | 232 | **注意点:** 233 | 234 | 1. client模式时,一些参数无效,请注意每个参数的可用模式 235 | 2. cluster模式时,您可以选择是否将当前login机器加入到Spark集群中,详见`--include-gateway`参数。 236 | 3. 不同可用区之间的数据传输需要收取公网流量费用:**¥0.8/GB** 237 | 238 | ### 模式参数说明 239 | 240 | 1. cluster模式 241 | ECS实例的申请,集群和服务的启动完全通过脚本完成。 242 | 2. client模式 243 | 基于用户已有ECS实例,完成集群和服务的启动。 244 | 通过阿里云的售卖页面完成ECS实例的购买可以更加直观地获得费用信息。client模式需要提供两个文件`masters`和`slaves`,分别包含Master节点和Slave节点的实例ID,即`InstanceId`。可以在[ECS控制台](https://console.aliyun.com/ecs/index.htm)查看每个ECS实例的`InstanceId`。 245 | 246 | **注意点:** 247 | 248 | 1. `masters`和`slaves`必须和脚本本放在同一目录中 249 | 2. 使用client模式时,您需要注意购买ECS实例时选择我们提供的镜像并设置相同的默认密码,具体可参考[Spark镜像列表](https://github.com/aliyun/spark-on-ecs/tree/master/ecs-image-list)。 250 | 251 | 252 | ## Spark相关 253 | 254 | ### Spark UI 255 | 目前提供两种方式支持Spark UI,即SSH隧道和公网开放式两种。 256 | 257 | 1. 
SSH隧道:通过在PC和Spark master节点之间的SSH隧道建立连接。这种方式安全性将会高一些,但需要您做一定的配置工作。具体操作过程请详见[SSH隧道使用指引](https://github.com/aliyun/spark-on-ecs/tree/master/doc/ssh_tunnel.md)。 258 | 2. 公网开放式:这种方式需要您在购买ECS实例时配置一个公网IP。这种方式会额外打开一些端口,例如8080,8081,9090等,安全性比SSH方式低,但使用上更加方便。 259 | - 脚本执行完, 会在当前目录创建Spark集群的Hosts列表文件,请把这个文件内容拷贝到本机的hosts文件中。Windows用户请编辑`C:\Windows\System32\drivers\etc\hosts`文件,Linux用户请编辑`\etc\hosts`文件 260 | - 由于每次创建集群的机器名和公网IP都会发生变化,所以一旦销毁集群请及时清除本机中相关的Hosts修改 261 | 262 | **注意:** 建议使用SSH隧道方式。 263 | 264 | ### Spark Notebook 265 | 266 | Spark Notebook提供一种交互式的编程方式,您可以在上面进行Spark程序开发。更多信息请关注[Spark Notebook](https://github.com/andypetrella/spark-notebook)的最新进展。 267 | 268 | ### Hue 269 | 270 | Hue是一种开源的进行大数据分析的Web平台。更多信息请关注[Cloudera-Hue](https://github.com/cloudera/hue)的最新进展。 271 | 272 | ### 默认配置文件 273 | 274 | 本脚执行时会动态修改一些软件的配置文件。这些软件的默认配置文件放置在/root/.config目录下: 275 | 276 | 1. packages.property文件:配置每个软件的安装路径 277 | 2. hadoop目录:hadoop配置文件目录,包含core-site.xml,hdfs-site.xml以及hadoop-env.sh 278 | 3. hue目录:Hue配置文件目录,包含hue.ini 279 | -------------------------------------------------------------------------------- /bin/spark_ecs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding=utf-8 3 | import sys 4 | import os 5 | import getpass 6 | from service import hdfs, hue, spark, spark_notebook 7 | from core import utils, ecs 8 | from core.common import GlobalVar 9 | from sys import stderr 10 | from optparse import OptionParser 11 | 12 | utils.welcome() 13 | 14 | class UsageError(Exception): 15 | pass 16 | 17 | def parse_args(): 18 | parser = OptionParser( 19 | prog="spark-ecs", 20 | usage="%prog [options] [:]\n\n" 21 | + " can be: launch, destroy, stop, start, enable, disable \n" 22 | + " can be anything you want \n" 23 | + " can be: hdfs, hue, spark-notebook") 24 | parser.add_option( 25 | "-m", '--mode', type="string", 26 | help="There are two modes, i.e. `client` and `cluster`. " + 27 | "In `client` mode, you need to buy ECS instances firstly, and then provide `masters` file listing " + 28 | "Spark master `InstanceId` and `slaves` listing Spark slave `InstanceId`. 
In `cluster` mode, you " + 29 | "can create ECS instances and start Spark cluster through this script.") 30 | parser.add_option( 31 | "-p", '--pwd', type="string", help="User password for each ECS instance.") 32 | parser.add_option( 33 | "-s", "--slaves", type="int", default=1, help="Number of slaves to launch (default: %default)") 34 | parser.add_option( 35 | "-d", "--disk-size", type="int", help="Size (in GB) of each ECS data disk") 36 | parser.add_option( 37 | "--ibo", type="string", default="2", help="Internet bandwidth out") 38 | parser.add_option( 39 | "-t", "--instance-type", type="string", help="Type of instance to launch.") 40 | parser.add_option( 41 | "-r", "--region", type="string", help="ECS region to launch instances in") 42 | parser.add_option( 43 | "-z", "--zone", type="string", 44 | help="Availability zone to launch instances in, or 'all' to spread " + 45 | "slaves across multiple (an additional RMB 0.8/Gb for bandwidth" + 46 | "between zones applies) (default: a single zone chosen at random)") 47 | parser.add_option("-i", "--ami", help="Aliyun Machine Image ID to use") 48 | parser.add_option( 49 | "-u", "--user", default="root", 50 | help="The SSH user you want to connect as (default: %default)") 51 | parser.add_option( 52 | "--authorized-address", type="string", default="0.0.0.0/0", 53 | help="Address to authorize on created security groups (default: %default)") 54 | parser.add_option( 55 | "--include-gateway", action="store_true", default=False, 56 | help="Whether to put current login machine into Spark Cluster." 57 | ) 58 | parser.add_option( 59 | "--enable-slave-public-ip", action="store_true", default=False, 60 | help="Whether to allocate a public network IP for Spark master." 61 | ) 62 | parser.add_option( 63 | "--enable-hdfs", action="store_true", default=False, 64 | help="Whether to launch a HDFS service" 65 | ) 66 | parser.add_option( 67 | "--enable-spark-notebook", action="store_true", default=False, 68 | help="Launch a spark-notebook. More information: https://github.com/andypetrella/spark-notebook" 69 | ) 70 | parser.add_option( 71 | "--enable-hue", action="store_true", default=False, 72 | help="Launch a Hue web Service" 73 | ) 74 | 75 | (opts, command) = parser.parse_args() 76 | if len(command) != 2: 77 | parser.print_help() 78 | print "\nYou need to provide a [:]\n" 79 | sys.exit(1) 80 | (action, name) = command 81 | if action in ["launch", "stop", "start", "destroy"]: 82 | GlobalVar.CLUSTER_HOSTS = name + "-hosts" 83 | 84 | return opts, action, name 85 | 86 | def launch_in_cluster_mode(cluster_name, opts): 87 | # check cluster status trickly 88 | if utils.check_cluster_status(cluster_name, ['Running', 'Stopped']): 89 | print "Cluster %s has been launched, please `Destroy` it first." 
% cluster_name 90 | sys.exit(1) 91 | do_validity_check(opts) 92 | 93 | if opts.slaves <= 0: 94 | print >> stderr, "ERROR: You have to start as least 1 slave" 95 | sys.exit(1) 96 | (masters, slaves, master_ip) = utils.launch_cluster(opts, cluster_name) 97 | utils.wait_for_cluster_state( 98 | cluster_state=['Running'], 99 | instances=masters + slaves) 100 | utils.mount_disk(masters, slaves, opts) 101 | spark.setup_cluster(masters, slaves, opts, True) 102 | if opts.enable_spark_notebook: 103 | spark_notebook.start_spark_notebook(masters, opts) 104 | if opts.enable_hue: 105 | hue.start_hue(masters, opts) 106 | if opts.enable_hdfs: 107 | hdfs.setup_hdfs(masters, slaves, opts) 108 | if opts.enable_slave_public_ip: 109 | utils.save_public_ips(masters, slaves) 110 | utils.open_nginx(opts, masters) 111 | 112 | utils.end_of_startup(opts, master_ip, masters) 113 | # update cluster status 114 | os.system("echo Running > %s%s" % (GlobalVar.CLUSTER_STATUS, cluster_name)) 115 | 116 | def destroy_in_cluster_mode(cluster_name, opts): 117 | do_validity_check(opts) 118 | print "Are you sure you want to destroy the cluster %s?" % cluster_name 119 | print "The following instances will be terminated:" 120 | (masters, slaves) = utils.get_masters_and_slaves(opts.mode, cluster_name) 121 | if len(masters + slaves) <= 0: 122 | print "There is no master or slave, check it first please." 123 | sys.exit(1) 124 | instances = masters + slaves 125 | gateway = ecs.get_gateway_instance_info(opts)['InstanceId'] 126 | if gateway in instances: 127 | instances.remove(gateway) 128 | 129 | to_release = [] 130 | for ins in instances: 131 | try: 132 | instance_info = ecs.get_instance_info(ins) 133 | to_release.append(ins) 134 | print "> %s" % (instance_info['HostName']) 135 | except Exception, e: 136 | if 'InvalidInstanceId.NotFound' in e.args: 137 | print "> %s, invalid `InstanceId` not found, skip it." % ins 138 | else: 139 | raise e 140 | 141 | utils.warning() 142 | msg = "All data on all nodes will be lost!!\nYou'd better stop it first. " \ 143 | "Destroy cluster %s (Y/n): " % cluster_name 144 | to_destroy = raw_input(msg) 145 | if to_destroy == "Y": 146 | try: 147 | ecs.release_ecs_instance(to_release) 148 | except Exception, e: 149 | print e, "\nReleasing ECS instances failed for some unknown reasons, " \ 150 | "you can do it through: https://console.aliyun.com/ecs/index.htm" 151 | raise e 152 | finally: 153 | utils.delete_file_safely(GlobalVar.CLUSTER_STATUS + cluster_name) 154 | utils.delete_file_safely(GlobalVar.CLUSTER_INSTANCES + cluster_name) 155 | utils.delete_file_safely(GlobalVar.SPARK_ECS_DIR + "/" + GlobalVar.CLUSTER_HOSTS) 156 | utils.delete_file_safely(GlobalVar.SPARK_ECS_DIR + "/" + GlobalVar.CLUSTER_HOSTS + "-public") 157 | else: 158 | print "Not `Y`, give up destroying cluster %s" % cluster_name 159 | 160 | def stop_in_cluster_mode(cluster_name, opts): 161 | # check cluster status trickly 162 | if utils.check_cluster_status(cluster_name, ['Stopped']): 163 | print "Cluster %s has been `Stopped`, you can not stop it again." % cluster_name 164 | sys.exit(1) 165 | do_validity_check(opts) 166 | 167 | (masters, slaves) = utils.get_masters_and_slaves(opts.mode, cluster_name) 168 | if len(masters + slaves) <= 0: 169 | print "There is no master or slave running, check it first please." 170 | sys.exit(1) 171 | 172 | print "==> Stopping Spark cluster..." 173 | utils.warning() 174 | msg = "Stopping Spark cluster will stop HDFS, spark-notebook and Hue at the same time. " \ 175 | "Stop it? 
(Y/n): " 176 | to_stop = raw_input(msg) 177 | if to_stop == "Y": 178 | if opts.pwd == "": 179 | opts.pwd = getpass.getpass("You need to provide the password for ECS instance:") 180 | spark.stop_spark_cluster(masters, slaves, opts) 181 | hdfs.stop_hdfs(masters, slaves, opts) 182 | hue.stop_hue(masters, opts) 183 | spark_notebook.stop_spark_notebook(masters, opts) 184 | utils.stop_nginx(opts,masters) 185 | # update cluster status 186 | os.system("echo Stopped > %s%s" % (GlobalVar.CLUSTER_STATUS, cluster_name)) 187 | else: 188 | print "Not `Y`, give up stopping cluster %s" % cluster_name 189 | 190 | def start_in_cluster_mode(cluster_name, opts): 191 | # check cluster status trickly 192 | if utils.check_cluster_status(cluster_name, ['Running']): 193 | print "Cluster %s is `Running`, please `Stop` it first." % cluster_name 194 | sys.exit(1) 195 | do_validity_check(opts) 196 | 197 | (masters, slaves) = utils.get_masters_and_slaves(opts.mode, cluster_name) 198 | if len(masters + slaves) <= 0: 199 | print "There is no master or slave, check it first please." 200 | sys.exit(1) 201 | 202 | print "==> Restarting spark cluster..." 203 | if opts.pwd == "": 204 | opts.pwd = getpass.getpass("You need to provide the password for ECS instance:") 205 | master_ip = ecs.get_instance_info(masters[0])['PublicIpAddress']['IpAddress'][0] 206 | spark.start_spark_cluster(masters[0], slaves, opts) 207 | if opts.enable_spark_notebook: 208 | spark_notebook.start_spark_notebook(masters, opts) 209 | if opts.enable_hue: 210 | hue.start_hue(masters, opts) 211 | if opts.enable_hdfs: 212 | hdfs.setup_hdfs(masters, slaves, opts) 213 | if opts.enable_slave_public_ip: 214 | utils.save_public_ips(masters, slaves) 215 | utils.open_nginx(opts, masters) 216 | 217 | utils.end_of_startup(opts, master_ip, masters) 218 | # update cluster status 219 | os.system("echo Running > %s%s" % (GlobalVar.CLUSTER_STATUS, cluster_name)) 220 | 221 | def launch_in_client_mode(cluster_name, opts): 222 | # check cluster status trickly 223 | if utils.check_cluster_status(cluster_name, ['Running', 'Stopped']): 224 | print "Cluster %s has been launched, please `Destroy` it first." % cluster_name 225 | sys.exit(1) 226 | do_validity_check(opts) 227 | 228 | (masters, slaves) = utils.get_masters_and_slaves(opts.mode) 229 | if len(masters) <= 0: 230 | print >> stderr, "ERROR: You have to start as least 1 master" 231 | sys.exit(1) 232 | if len(slaves) <= 0: 233 | print >> stderr, "ERROR: You have to start as least 1 slave" 234 | sys.exit(1) 235 | 236 | # Now we only support single-node master. 237 | spark.setup_cluster(masters, slaves, opts, True) 238 | if opts.enable_spark_notebook: 239 | spark_notebook.start_spark_notebook(masters, opts) 240 | if opts.enable_hue: 241 | hue.start_hue(masters, opts) 242 | if opts.enable_hdfs: 243 | hdfs.setup_hdfs(masters, slaves, opts) 244 | if opts.enable_slave_public_ip: 245 | utils.save_public_ips(masters, slaves) 246 | master_ip = ecs.get_instance_info(masters[0])['PublicIpAddress']['IpAddress'][0] 247 | 248 | utils.open_nginx(opts, masters) 249 | utils.end_of_startup(opts, master_ip, masters) 250 | # update cluster status 251 | os.system("echo Running > %s%s" % (GlobalVar.CLUSTER_STATUS, cluster_name)) 252 | 253 | def destroy_in_client_mode(cluster_name, opts): 254 | do_validity_check(opts) 255 | (masters, slaves) = utils.get_masters_and_slaves(opts.mode) 256 | if len(masters + slaves) <= 0: 257 | print "There is no master or slave, check it first please." 
258 | sys.exit(1) 259 | 260 | print "Are you sure you want to destroy the cluster %s?" % cluster_name 261 | print "The following instances will be terminated:" 262 | instances = masters + slaves 263 | gateway = ecs.get_gateway_instance_info(opts)['InstanceId'] 264 | if gateway in instances: 265 | instances.remove(gateway) 266 | to_release = [] 267 | for ins in instances: 268 | try: 269 | instance_info = ecs.get_instance_info(ins) 270 | to_release.append(ins) 271 | print "> %s" % (instance_info['HostName']) 272 | except Exception, e: 273 | if 'InvalidInstanceId.NotFound' in e.args: 274 | print "> %s, invalid `InstanceId` not found, skip it." % ins 275 | else: 276 | raise e 277 | 278 | utils.warning() 279 | msg = "All data on all nodes will be lost!!\nYou'd better stop it first. " \ 280 | "Destroy cluster %s (Y/n): " % cluster_name 281 | to_destroy = raw_input(msg) 282 | if to_destroy == "Y": 283 | try: 284 | ecs.release_ecs_instance(to_release) 285 | except Exception, e: 286 | print e, "Releasing ECS instances failed for some unknown reasons, " \ 287 | "you can do it through: https://console.aliyun.com/ecs/index.htm" 288 | raise e 289 | finally: 290 | utils.delete_file_safely(GlobalVar.CLUSTER_STATUS + cluster_name) 291 | utils.delete_file_safely(GlobalVar.CLUSTER_INSTANCES + cluster_name) 292 | utils.delete_file_safely(GlobalVar.SPARK_ECS_DIR + "/" + GlobalVar.CLUSTER_HOSTS) 293 | utils.delete_file_safely(GlobalVar.SPARK_ECS_DIR + "/" + GlobalVar.CLUSTER_HOSTS + "-public") 294 | else: 295 | print "Not `Y`, give up destroying cluster %s" % cluster_name 296 | sys.exit(1) 297 | 298 | def stop_in_client_mode(cluster_name, opts): 299 | # check cluster status trickly 300 | if utils.check_cluster_status(cluster_name, ['Stopped']): 301 | print "Cluster %s has been `Stopped`, you can not stop it again." % cluster_name 302 | sys.exit(1) 303 | do_validity_check(opts) 304 | 305 | (masters, slaves) = utils.get_masters_and_slaves(opts.mode) 306 | if len(masters + slaves) <= 0: 307 | print "There is no master or slave running, check it first please." 308 | sys.exit(1) 309 | 310 | print "==> Stopping spark cluster..." 311 | utils.warning() 312 | msg = "Stopping Spark cluster will stop HDFS, spark-notebook and Hue at the same time. " \ 313 | "Stop %s? (Y/n): " % cluster_name 314 | to_stop = raw_input(msg) 315 | if to_stop == "Y": 316 | if opts.pwd == "": 317 | opts.pwd = getpass.getpass("You need to provide the password for ECS instance:") 318 | spark.stop_spark_cluster(masters, slaves, opts) 319 | hdfs.stop_hdfs(masters, slaves, opts) 320 | hue.stop_hue(masters, opts) 321 | spark_notebook.stop_spark_notebook(masters, opts) 322 | utils.stop_nginx(opts,masters) 323 | # update cluster status 324 | os.system("echo Stopped > %s%s" % (GlobalVar.CLUSTER_STATUS, cluster_name)) 325 | else: 326 | print "Not `Y`, give up stopping cluster %s" % cluster_name 327 | 328 | def start_in_client_mode(cluster_name, opts): 329 | # check cluster status trickly 330 | if utils.check_cluster_status(cluster_name, ['Running']): 331 | print "Cluster %s is `Running`, please `Stop` it first." % cluster_name 332 | sys.exit(1) 333 | do_validity_check(opts) 334 | 335 | (masters, slaves) = utils.get_masters_and_slaves(opts.mode) 336 | if len(masters + slaves) <= 0: 337 | print "There is no master or slave, check it first please." 338 | sys.exit(1) 339 | 340 | print "==> Restarting spark cluster..." 
341 |     if opts.pwd == "":
342 |         opts.pwd = getpass.getpass("You need to provide the password for ECS instance:")
343 |     spark.start_spark_cluster(masters[0], slaves, opts)
344 |     if opts.enable_spark_notebook:
345 |         spark_notebook.start_spark_notebook(masters, opts)
346 |     if opts.enable_hue:
347 |         hue.start_hue(masters, opts)
348 |     if opts.enable_hdfs:
349 |         hdfs.setup_hdfs(masters, slaves, opts)
350 |     if opts.enable_slave_public_ip:
351 |         utils.save_public_ips(masters, slaves)
352 |     master_ip = ecs.get_instance_info(masters[0])['PublicIpAddress']['IpAddress'][0]
353 |     utils.open_nginx(opts, masters)
354 |     utils.end_of_startup(opts, master_ip, masters)
355 |     # update cluster status
356 |     os.system("echo Running > %s%s" % (GlobalVar.CLUSTER_STATUS, cluster_name))
357 | 
358 | def enable_module(name, opts):
359 |     if len(name.split(":")) != 2:
360 |         print "\nYou need to provide a <cluster-name>:<module-name>\n"
361 |         sys.exit(1)
362 |     cluster_name = name.split(":")[0]
363 |     module_name = name.split(":")[1]
364 |     do_validity_check(opts)
365 |     (masters, slaves) = utils.get_masters_and_slaves(opts.mode, cluster_name)
366 |     if module_name == "hdfs":
367 |         hdfs.setup_hdfs(masters, slaves, opts)
368 |     elif module_name == "hue":
369 |         hue.start_hue(masters, opts)
370 |     elif module_name == "spark-notebook":
371 |         spark_notebook.start_spark_notebook(masters, opts)
372 |     else:
373 |         print "Now we only support 3 modules: hdfs, hue, spark-notebook"
374 |         sys.exit(1)
375 | 
376 | def disable_module(name, opts):
377 |     if len(name.split(":")) != 2:
378 |         print "\nYou need to provide a <cluster-name>:<module-name>\n"
379 |         sys.exit(1)
380 |     cluster_name = name.split(":")[0]
381 |     module_name = name.split(":")[1]
382 |     do_validity_check(opts)
383 |     (masters, slaves) = utils.get_masters_and_slaves(opts.mode, cluster_name)
384 |     if module_name == "hdfs":
385 |         hdfs.stop_hdfs(masters, slaves, opts)
386 |     elif module_name == "hue":
387 |         hue.stop_hue(masters, opts)
388 |     elif module_name == "spark-notebook":
389 |         spark_notebook.stop_spark_notebook(masters, opts)
390 |     else:
391 |         print "Now we only support 3 modules: hdfs, hue, spark-notebook"
392 |         sys.exit(1)
393 | 
394 | def do_validity_check(opts):
395 |     if opts.region is None:
396 |         length = len(GlobalVar.ECS_REGION)
397 |         print "There are %s regions available, listed as follows:\n" % length
398 |         for id in range(1, length + 1):
399 |             print id, ":", GlobalVar.ECS_REGION["%s" % id]
400 |         print
401 |         msg = "Please specify the ECS region No. (like 1): "
402 |         opts.region = GlobalVar.ECS_REGION[raw_input(msg).strip()]
403 | 
404 |     if opts.pwd is None:
405 |         opts.pwd = getpass.getpass("""You need to provide a password for ECS instance.
406 | If `CLIENT` mode, you just need to provide login machine's password.
407 | If `CLUSTER` mode and `--include-gateway`, you just need to provide login machine's password.
408 | If `CLUSTER` mode only, you need to set a new default password for each ECS instance.
409 | Please set a password:""") 410 | 411 | def real_main(): 412 | (opts, action, name) = parse_args() 413 | utils.setup_sshpass() 414 | utils.read_properties() 415 | 416 | if opts.mode is None: 417 | msg = "Please specify the running mode, client/cluster: " 418 | opts.mode = raw_input(msg).strip() 419 | 420 | try: 421 | if action == "launch" and opts.mode == "cluster": 422 | launch_in_cluster_mode(name, opts) 423 | elif action == "destroy" and opts.mode == "cluster": 424 | destroy_in_cluster_mode(name, opts) 425 | elif action == "stop" and opts.mode == "cluster": 426 | stop_in_cluster_mode(name, opts) 427 | elif action == "start" and opts.mode == "cluster": 428 | start_in_cluster_mode(name, opts) 429 | elif action == "launch" and opts.mode == "client": 430 | launch_in_client_mode(name, opts) 431 | elif action == "destroy" and opts.mode == "client": 432 | destroy_in_client_mode(name, opts) 433 | elif action == "stop" and opts.mode == "client": 434 | stop_in_client_mode(name, opts) 435 | elif action == "start" and opts.mode == "client": 436 | start_in_client_mode(name, opts) 437 | elif action == "enable": 438 | enable_module(name, opts) 439 | elif action == "disable": 440 | disable_module(name, opts) 441 | else: 442 | print "Wrong action or mode or module. We support: \n " \ 443 | "6 actions: launch, stop, start, destroy, enable, disable \n " \ 444 | "2 modes: client and cluster \n " \ 445 | "3 modules: hdfs, hue, spark-notebook" 446 | except RuntimeError as e: 447 | utils.do_rollback() 448 | 449 | def main(): 450 | try: 451 | GlobalVar.SPARK_ECS_DIR = os.path.dirname(os.path.realpath(__file__)) 452 | real_main() 453 | except UsageError, e: 454 | print >> stderr, "\nERROR:\n", 455 | sys.exit(1) 456 | 457 | if __name__ == "__main__": 458 | main() 459 | 460 | -------------------------------------------------------------------------------- /bin/core/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding=utf-8 3 | import sys 4 | import os 5 | import shutil 6 | import pipes 7 | import getpass 8 | import subprocess 9 | import textwrap 10 | import time 11 | import ConfigParser 12 | import ecs 13 | from datetime import datetime 14 | from sys import stderr 15 | from xml.etree import ElementTree as ET 16 | from nginx import start_nginx, do_stop_nginx 17 | from common import GlobalVar 18 | 19 | class UsageError(Exception): 20 | pass 21 | 22 | def setup_sshpass(): 23 | try: 24 | print "==> Checking sshpass installed or not..." 25 | subprocess.check_call(['sshpass', '-V']) 26 | except Exception: 27 | print "Begin to setup sshpass..." 
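        # Try yum first (RHEL/CentOS images); fall back to apt-get on Debian/Ubuntu images.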
28 | try: 29 | subprocess.check_call(['yum', '-y', 'install', 'sshpass']) 30 | except Exception: 31 | subprocess.check_call(['apt-get', '-y', 'install', 'sshpass']) 32 | 33 | def read_properties(): 34 | if os.path.exists(GlobalVar.PROPERTY_FILE): 35 | cf = ConfigParser.ConfigParser() 36 | cf.read(GlobalVar.PROPERTY_FILE) 37 | GlobalVar.SPARK_INSTALL_DIR = cf.get('path', 'spark') 38 | GlobalVar.SPARK_NOTEBOOK_INSTALL_DIR = cf.get('path', 'spark-notebook') 39 | GlobalVar.HUE_INSTALL_DIR = cf.get('path', 'hue') 40 | GlobalVar.HADOOP_INSTALL_DIR = cf.get('path', 'hadoop') 41 | GlobalVar.HADOOP_CONF_DIR = "%s/etc/hadoop" % GlobalVar.HADOOP_INSTALL_DIR 42 | 43 | def save_masters_or_slaves(cluster_name, machine_type, instance_id): 44 | if instance_id is None: 45 | return 46 | dir = "%s/%s" % (GlobalVar.CLUSTER_INSTANCES, cluster_name) 47 | if not os.path.exists(dir): 48 | os.makedirs(dir) 49 | file = "%s/%s" % (dir, machine_type) 50 | if not os.path.exists(file): 51 | f = open(file, 'w') 52 | f.close() 53 | os.system("echo %s >> %s" % (str(instance_id), file)) 54 | 55 | def get_masters_and_slaves(mode, cluster_name=""): 56 | masters = [] 57 | slaves = [] 58 | if mode == "client": 59 | masters_file = "%s/%s" % (GlobalVar.SPARK_ECS_DIR, "masters") 60 | slaves_file = "%s/%s" % (GlobalVar.SPARK_ECS_DIR, "slaves") 61 | else: 62 | masters_file = "%s/%s/%s" % (GlobalVar.CLUSTER_INSTANCES, cluster_name, "masters") 63 | slaves_file = "%s/%s/%s" % (GlobalVar.CLUSTER_INSTANCES, cluster_name, "slaves") 64 | 65 | if os.path.exists(masters_file): 66 | f = open(masters_file, 'r') 67 | for line in f.readlines(): 68 | masters.append(line.strip()) 69 | if os.path.exists(slaves_file): 70 | f = open(slaves_file, 'r') 71 | for line in f.readlines(): 72 | slaves.append(line.strip()) 73 | 74 | return masters, slaves 75 | 76 | def match_and_change(property, tag, content): 77 | children = property.getchildren() 78 | if children[0].text == tag: 79 | children[1].text = content 80 | return 81 | 82 | def update_hadoop_configuration(namenode_url): 83 | file = ET.parse(GlobalVar.HADOOP_CONF_DIR + '/core-site.xml') 84 | properties = file.findall('./property') 85 | for property in properties: 86 | match_and_change(property, 'fs.defaultFS', namenode_url) 87 | file.write(GlobalVar.HADOOP_CONF_DIR + '/core-site.xml', encoding="utf-8") 88 | 89 | def ssh_args(): 90 | parts = ['-o', 'StrictHostKeyChecking=no'] 91 | parts += ['-o', 'UserKnownHostsFile=/dev/null'] 92 | parts += ['-o', 'LogLevel=quiet'] 93 | return parts 94 | 95 | def ssh_command(): 96 | return ['ssh'] + ssh_args() 97 | 98 | def scp_command(): 99 | return ['scp', '-r'] + ssh_args() 100 | 101 | def stringify_command(parts): 102 | if isinstance(parts, str): 103 | return parts 104 | else: 105 | return ' '.join(map(pipes.quote, parts)) 106 | 107 | def is_ssh_available(ip, opts, print_ssh_output=True): 108 | 109 | s = subprocess.Popen( 110 | ssh_command() + ['-t', '-t', '-o', 'ConnectTimeout=3', 111 | '%s@%s' % (opts.user, ip), stringify_command('true')], 112 | stdout=subprocess.PIPE, 113 | stderr=subprocess.STDOUT # we pipe stderr through stdout to preserve output order 114 | ) 115 | cmd_output = s.communicate()[0] # [1] is stderr, which we redirected to stdout 116 | 117 | if s.returncode != 0 and print_ssh_output: 118 | # extra leading newline is for spacing in wait_for_cluster_state() 119 | print textwrap.dedent("""\n 120 | Warning: SSH connection error. (This could be temporary.) 
121 |             Host: {h}
122 |             SSH return code: {r}
123 |             SSH output: {o}
124 |         """).format(
125 |             h=ip,
126 |             r=s.returncode,
127 |             o=cmd_output.strip()
128 |         )
129 | 
130 |     return s.returncode == 0
131 | 
132 | def is_cluster_ssh_available(cluster_instances, opts):
133 |     # Only report success once every instance in the cluster accepts an SSH connection.
134 |     for i in cluster_instances:
135 |         instance_info = ecs.get_instance_info(i)
136 |         ip = instance_info['InnerIpAddress']['IpAddress'][0]
137 |         if not is_ssh_available(ip, opts, True):
138 |             return False
139 |     return True
140 | 
141 | def save_public_ips(masters, slaves):
142 |     cluster_hosts = open(GlobalVar.SPARK_ECS_DIR + "/" + GlobalVar.CLUSTER_HOSTS + "-public", 'w')
143 | 
144 |     for node in masters + slaves:
145 |         instance_info = ecs.get_instance_info(node)
146 |         host = instance_info['HostName']
147 |         ip = instance_info['PublicIpAddress']['IpAddress'][0]
148 |         cluster_hosts.write(ip + " " + host + "\n")
149 | 
150 |     cluster_hosts.close()
151 | 
152 | def check_cluster_status(cluster_name, status):
153 |     if not os.path.exists(GlobalVar.DEFAULT_CONF_DIR + "/status"):
154 |         os.mkdir(GlobalVar.DEFAULT_CONF_DIR + "/status")
155 |     if not os.path.exists(GlobalVar.CLUSTER_STATUS + cluster_name):
156 |         f = open(GlobalVar.CLUSTER_STATUS + cluster_name, "w")
157 |         f.close()
158 |         return False
159 |     f = open(GlobalVar.CLUSTER_STATUS + cluster_name, "r")
160 |     stat = f.readline().strip()
161 |     return stat in status
162 | 
163 | def delete_file_safely(path):
164 |     print "deleting %s" % path
165 |     if os.path.exists(path):
166 |         if os.path.isdir(path):
167 |             shutil.rmtree(path)
168 |         else:
169 |             os.remove(path)
170 | 
171 | def launch_cluster(opts, cluster_name):
172 |     if opts.pwd == "":
173 |         opts.pwd = getpass.getpass("""You need to provide a password for ECS instance.
174 | If `CLIENT` mode, you just need to provide login machine's password.
175 | If `CLUSTER` mode and `--include-gateway`, you just need to provide login machine's password.
176 | If `CLUSTER` mode only, you need to set a new default password for each ECS instance.
177 | Please set a password:""")
178 | 
179 |     if opts.ami is None:
180 |         print "You need to specify an available ECS image, listed as follows: \n"
181 |         length = len(GlobalVar.AVAILABLE_SAPRK_VERSION)
182 |         for idx in range(1, length+1):
183 |             id = "%s" % idx
184 |             print idx, ': ', GlobalVar.AVAILABLE_SAPRK_VERSION[id]
185 |         print
186 |         msg = "Please choose an image No. (like: 1): "
187 |         id = raw_input(msg)
188 |         spark_version = GlobalVar.AVAILABLE_SAPRK_VERSION[id]
189 |         opts.ami = GlobalVar.SPARK_IMAGES[(spark_version, opts.region)]
190 | 
191 |     if opts.instance_type is None:
192 |         print "You need to specify the type of ECS instance, listed as follows: \n\n" \
193 |               "%-14s: %s" % ("type name", "(cores, memory)")
194 |         for instance_type in GlobalVar.ECS_INSTANCE_TYPE:
195 |             print "%-14s: %s" % (instance_type, GlobalVar.ECS_INSTANCE_TYPE[instance_type])
196 |         print
197 |         msg = "Please choose an ECS instance type (like: ecs.t1.small): "
198 |         opts.instance_type = str(raw_input(msg)).strip()
199 | 
200 |     print "==> Begin to launch Spark cluster..."
201 |     print_shopping_list(opts)
202 |     print "==> Setting internet security rules..."
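    # Reuse the gateway's security group for the whole cluster: clear its existing rules,
    # allow inbound SSH (22/tcp) from --authorized-address plus all outbound TCP, and open
    # the Spark UI / HDFS ports (8080, 8081, 9000) only when --enable-slave-public-ip is set.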
203 | current_group_id = ecs.get_gateway_instance_info(opts)['SecurityGroupIds']['SecurityGroupId'][0] 204 | ecs.clear_security_group_rules(current_group_id, opts) 205 | authorized_address = opts.authorized_address 206 | ecs.authorize_security_group_in(current_group_id, 'tcp', "", authorized_address, '22/22', opts) 207 | ecs.authorize_security_group_out(current_group_id, 'tcp', "", authorized_address, '1/65535', opts) 208 | if opts.enable_slave_public_ip: 209 | ecs.authorize_security_group_in(current_group_id, 'tcp', "", authorized_address, '8080/8080', opts) 210 | ecs.authorize_security_group_in(current_group_id, 'tcp', "", authorized_address, '8081/8081', opts) 211 | ecs.authorize_security_group_in(current_group_id, 'tcp', "", authorized_address, '9000/9000', opts) 212 | 213 | print "==> Launching master and slaves..." 214 | # Launch slaves 215 | master_instances = [] 216 | slave_instacens = [] 217 | count = 0 218 | while (count < opts.slaves): 219 | slave_instance_name = cluster_name + "-slave-%s" % (count) 220 | slave_instance_id = ecs.launch_instance(opts, cluster_name, "slaves", opts.ami, opts.instance_type, current_group_id, 221 | slave_instance_name, opts.ibo, slave_instance_name, 222 | opts.pwd, open_public_ip=opts.enable_slave_public_ip) 223 | slave_instacens.append(slave_instance_id) 224 | count += 1 225 | 226 | if not opts.include_gateway: 227 | # Launch master 228 | master_instance_name = cluster_name + "-master" 229 | master_instance_id = ecs.launch_instance(opts, cluster_name, "masters", opts.ami, opts.instance_type, current_group_id, 230 | master_instance_name, opts.ibo, master_instance_name, 231 | opts.pwd, open_public_ip=True) 232 | master_instances.append(master_instance_id) 233 | else: 234 | gateway = ecs.get_gateway_instance_info(opts)['InstanceId'] 235 | master_instances.append(gateway) 236 | save_masters_or_slaves(cluster_name, "masters", gateway) 237 | 238 | master_ip = ecs.get_instance_info(master_instances[0])['PublicIpAddress']['IpAddress'][0] 239 | 240 | return master_instances, slave_instacens, master_ip 241 | 242 | def wait_for_cluster_state(cluster_state, instances): 243 | sys.stdout.write("==> Waiting for cluster to enter one of `{s}` status .".format(s=cluster_state)) 244 | sys.stdout.flush() 245 | 246 | start_time = datetime.now() 247 | while True: 248 | time.sleep(5) 249 | 250 | all_instances_status = ecs.get_all_instances_status(instances) 251 | if all(status in cluster_state for status in all_instances_status): 252 | break 253 | 254 | sys.stdout.write(".") 255 | sys.stdout.flush() 256 | sys.stdout.write("\n") 257 | 258 | end_time = datetime.now() 259 | print "Cluster is now in one of '{s}' status. 
Waited {t} seconds.".format( 260 | s=cluster_state, 261 | t=(end_time - start_time).seconds 262 | ) 263 | 264 | def update_hosts(instance_id, opts, src, dst): 265 | src_file = src + "/" + GlobalVar.CLUSTER_HOSTS 266 | dst_file = dst + "/" + GlobalVar.CLUSTER_HOSTS 267 | append_hosts = "cat %s >> /etc/hosts" % dst_file 268 | remove_tmp_hosts = "rm -f %s" % dst_file 269 | do_scp(instance_id, opts, src_file, dst_file) 270 | do_ssh(instance_id, opts, append_hosts) 271 | do_ssh(instance_id, opts, remove_tmp_hosts) 272 | 273 | def do_scp(instance_id, opts, src, dst): 274 | instance_info = ecs.get_instance_info(instance_id) 275 | ip = instance_info['InnerIpAddress']['IpAddress'][0] 276 | tries = 0 277 | while True: 278 | try: 279 | res = subprocess.check_call( 280 | ["sshpass", "-p", opts.pwd] + 281 | scp_command() + [src, '%s@%s:%s' % (opts.user, ip, dst)]) 282 | if res != 0: 283 | raise RuntimeError("Error executing remote command.") 284 | return res 285 | except subprocess.CalledProcessError as e: 286 | if tries > 5: 287 | # If this was an ssh failure, provide the user with hints. 288 | if e.returncode == 255: 289 | raise UsageError( 290 | "Failed to SSH to remote host {0}.\n".format(ip)) 291 | else: 292 | raise e 293 | print >> stderr, \ 294 | "Error executing remote command, retrying after 10 seconds." 295 | time.sleep(10) 296 | tries += 1 297 | 298 | def do_ssh(instance_id, opts, command): 299 | instance_info = ecs.get_instance_info(instance_id) 300 | ip = instance_info['InnerIpAddress']['IpAddress'][0] 301 | tries = 0 302 | while True: 303 | try: 304 | res = subprocess.check_call( 305 | ["sshpass", "-p", opts.pwd] + 306 | ssh_command() + ['-t', '-t', '%s@%s' % (opts.user, ip), 307 | stringify_command(command)]) 308 | if res != 0: 309 | raise RuntimeError("Error executing remote command.") 310 | return res 311 | except subprocess.CalledProcessError as e: 312 | if tries > 5: 313 | # If this was an ssh failure, provide the user with hints. 314 | if e.returncode == 255: 315 | raise UsageError( 316 | "Failed to SSH to remote host {0}.\n".format(ip)) 317 | else: 318 | raise e 319 | print >> stderr, \ 320 | "Error executing remote command, retrying after 10 seconds." 
321 | time.sleep(10) 322 | tries += 1 323 | 324 | def _check_output(*popenargs, **kwargs): 325 | if 'stdout' in kwargs: 326 | raise ValueError('stdout argument not allowed, it will be overridden.') 327 | process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) 328 | output, unused_err = process.communicate() 329 | retcode = process.poll() 330 | if retcode: 331 | cmd = kwargs.get("args") 332 | if cmd is None: 333 | cmd = popenargs[0] 334 | raise subprocess.CalledProcessError(retcode, cmd, output=output) 335 | return output 336 | 337 | def ssh_read(instance_id, opts, command): 338 | instance_info = ecs.get_instance_info(instance_id) 339 | ip = instance_info['InnerIpAddress']['IpAddress'][0] 340 | return _check_output( 341 | ["sshpass", "-p", opts.pwd] + ssh_command() + ['%s@%s' % (opts.user, ip), stringify_command(command)]) 342 | 343 | def ssh_write(instance_id, opts, command, arguments): 344 | instance_info = ecs.get_instance_info(instance_id) 345 | ip = instance_info['InnerIpAddress']['IpAddress'][0] 346 | tries = 0 347 | while True: 348 | proc = subprocess.Popen( 349 | ["sshpass", "-p", opts.pwd] + 350 | ssh_command() + ['%s@%s' % (opts.user, ip), stringify_command(command)], 351 | stdin=subprocess.PIPE) 352 | proc.stdin.write(arguments) 353 | proc.stdin.close() 354 | status = proc.wait() 355 | if status == 0: 356 | break 357 | elif tries > 5: 358 | raise RuntimeError("ssh_write failed with error %s" % proc.returncode) 359 | else: 360 | print >> stderr, \ 361 | "Error {0} while executing remote command, retrying after 10 seconds".format(status) 362 | time.sleep(10) 363 | tries = tries + 1 364 | 365 | def prepare_hosts(master, slaves, opts): 366 | cluster_hosts = open(GlobalVar.SPARK_ECS_DIR + "/" + GlobalVar.CLUSTER_HOSTS, 'w') 367 | instance_info = ecs.get_instance_info(master) 368 | host = instance_info['HostName'] 369 | ip = instance_info['InnerIpAddress']['IpAddress'][0] 370 | cluster_hosts.write(ip + " " + host + "\n") 371 | 372 | for slave in slaves: 373 | instance_info = ecs.get_instance_info(slave) 374 | host = instance_info['HostName'] 375 | ip = instance_info['InnerIpAddress']['IpAddress'][0] 376 | cluster_hosts.write(ip + " " + host + "\n") 377 | 378 | cluster_hosts.close() 379 | update_hosts(master, opts, GlobalVar.SPARK_ECS_DIR, "/root/") 380 | for slave in slaves: 381 | update_hosts(slave, opts, GlobalVar.SPARK_ECS_DIR, "/root/") 382 | 383 | def mount_disk(masters, slaves, opts): 384 | print "==> mounting data disk: /dev/xvdb ..." 385 | src = "%s/sh/mount_disk.sh" % GlobalVar.SPARK_ECS_DIR 386 | dst = "/root/" 387 | command = "/bin/bash /root/mount_disk.sh > /dev/null 2>&1" 388 | for ins in masters + slaves: 389 | do_scp(ins, opts, src, dst) 390 | do_ssh(ins, opts, command) 391 | print "==> mounted OK..." 392 | 393 | # def update_default_output(opts): 394 | 395 | 396 | def open_nginx(opts,masters): 397 | print "==> Starting nginx service..." 398 | host_info_path = GlobalVar.CLUSTER_HOSTS 399 | master_ip = ecs.get_instance_info(masters[0])['PublicIpAddress']['IpAddress'][0] 400 | result_code = start_nginx(opts, host_info_path, master_ip) 401 | if result_code == 1: 402 | print("[success] start nginx succcess ...") 403 | else: 404 | print("[error] start nginx failed ...") 405 | 406 | def stop_nginx(opts, masters): 407 | print "==> Stopping nginx service..." 
408 | master_ip = ecs.get_instance_info(masters[0])['PublicIpAddress']['IpAddress'][0] 409 | result_code = do_stop_nginx(opts, master_ip) 410 | if result_code == 1: 411 | print("[success] stop nginx succcess ...") 412 | else: 413 | print("[error] stop nginx failed ...") 414 | 415 | def do_rollback(): 416 | print "==> Doing rollback..." 417 | # TODO: 418 | 419 | def welcome(): 420 | print """ 421 | Welcome to: 422 | ____ __ _____________ 423 | / __/__ ___ _____/ /__ ___ ____ /___/___//___/ 424 | _\ \/ _ \/ _ `/ __/ '_/ / _ \/__ / /___//___.\ \. 425 | /___/ .__/\_,_/_/ /_/\_\ \__./_/_/ /___/____/___/ 426 | /_/ version 0.1 427 | 428 | Type --help for more information. 429 | """ 430 | 431 | def print_shopping_list(opts): 432 | (cores, memory) = GlobalVar.ECS_INSTANCE_TYPE[opts.instance_type] 433 | if opts.disk_size is None: 434 | disk_size = "None" 435 | else: 436 | disk_size = "%sG" % opts.disk_size 437 | current_group_id = ecs.get_gateway_instance_info(opts)['SecurityGroupIds']['SecurityGroupId'][0] 438 | if opts.enable_slave_public_ip: 439 | slave_internet_charge_type = "PayByTraffic" 440 | slave_internet_bandwidth_out = opts.ibo 441 | else: 442 | slave_internet_charge_type = "PayByBandwidth" 443 | slave_internet_bandwidth_out = "0" 444 | 445 | print """The ECS instance configuration listed as following: 446 | 447 | +--------------------------------------------------------+ 448 | + Check List + 449 | +--------------------------------------------------------+""" 450 | if not opts.include_gateway: 451 | print """ 452 | Running Mode: %s 453 | 454 | Master Instance: 455 | Number: %s 456 | Region: %s 457 | Zone: %s 458 | Image: %s 459 | Cores: %s 460 | Memory: %sG 461 | Disk: %s 462 | InstanceType: %s 463 | SecurityGroup: %s 464 | InternetChargeType: %s 465 | InternetMaxBandwidthOut: %s 466 | """ % (opts.mode, "1", opts.region, opts.zone, opts.ami, cores, memory, 467 | disk_size, opts.instance_type, current_group_id, "PayByTraffic", opts.ibo) 468 | print """ 469 | Slave Instance: 470 | Number: %s 471 | Region: %s 472 | Zone: %s 473 | Image: %s 474 | Cores: %s 475 | Memory: %sG 476 | Disk: %s 477 | InstanceType: %s 478 | SecurityGroup: %s 479 | InternetChargeType: %s 480 | InternetMaxBandwidthOut: %s 481 | +--------------------------------------------------------+ 482 | """ % (opts.slaves, opts.region, opts.zone, opts.ami, cores, memory, opts.instance_type, 483 | disk_size, current_group_id, slave_internet_charge_type, slave_internet_bandwidth_out) 484 | msg = "Continue buying? (Y/n): " 485 | to_buy = raw_input(msg) 486 | if to_buy != "Y": 487 | print "Not `Y`, give up buying ECS instances, Goodbye!" 488 | sys.exit(1) 489 | 490 | def end_of_startup(opts, master_ip, masters): 491 | master_name = ecs.get_instance_info(masters[0])['HostName'] 492 | print """ 493 | +--------------------------------------------------------+ 494 | + Spark Cluster Started Successfully! 
+ 495 | +--------------------------------------------------------+ 496 | The Spark Cluster Configuration listed as following: 497 | 498 | Spark Cluster: 499 | 500 | Master Node IP: %s 501 | Spark UI: http://%s:8080 502 | Master URL: spark://%s:7077 503 | 504 | """ % (master_ip, master_ip, master_name) 505 | 506 | if opts.enable_hdfs: 507 | print """ 508 | HDFS NameNode URL: hdfs://%s:9000 509 | """ % master_ip 510 | 511 | if opts.enable_spark_notebook: 512 | print """ 513 | Spark Notebook: http://%s:9090 514 | """ % master_ip 515 | if opts.enable_hue: 516 | print """ 517 | Hue: http://%s:8888 518 | """ % master_ip 519 | print""" 520 | +--------------------------------------------------------+ 521 | """ 522 | 523 | def warning(): 524 | print """ 525 | ********************************************************** 526 | ** WARNING!!! ** 527 | ********************************************************** 528 | """ 529 | --------------------------------------------------------------------------------
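A minimal usage sketch, inferred from parse_args() and real_main() in bin/spark_ecs.py above. The cluster name `demo` and all option values are illustrative only; see doc/manual.md for the authoritative guide. Anything not supplied on the command line (running mode, region, image, instance type, password) is prompted for interactively.

# Launch a cluster named `demo` with 3 slaves (the script prompts for mode, region, image, etc.):
python bin/spark_ecs.py --slaves 3 --instance-type ecs.t1.small --pwd MyPassword launch demo

# Manage the lifecycle of the same cluster:
python bin/spark_ecs.py stop demo
python bin/spark_ecs.py start demo
python bin/spark_ecs.py destroy demo

# Enable or disable an optional module (hdfs, hue, spark-notebook) on the cluster:
python bin/spark_ecs.py enable demo:spark-notebook
python bin/spark_ecs.py disable demo:spark-notebook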