├── bin
│   ├── core
│   │   ├── __init__.py
│   │   ├── nginx.py
│   │   ├── common.py
│   │   ├── config_nginx.py
│   │   ├── ecs.py
│   │   └── utils.py
│   ├── service
│   │   ├── __init__.py
│   │   ├── spark_notebook.py
│   │   ├── hue.py
│   │   ├── spark.py
│   │   └── hdfs.py
│   ├── conf
│   │   └── nginx.conf.template
│   ├── sh
│   │   └── mount_disk.sh
│   └── spark_ecs.py
├── ecs-image-list
│   ├── cn-beijing
│   │   └── ecs-image-id
│   ├── cn-hangzhou
│   │   └── ecs-image-id
│   ├── cn-qingdao
│   │   └── ecs-image-id
│   └── cn-shenzhen
│       └── ecs-image-id
├── README.md
├── doc
│   ├── ssh_tunnel.md
│   └── manual.md
└── LICENSE

/bin/core/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/bin/service/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/ecs-image-list/cn-beijing/ecs-image-id:
--------------------------------------------------------------------------------
1 | m-25dt21m47
--------------------------------------------------------------------------------
/ecs-image-list/cn-hangzhou/ecs-image-id:
--------------------------------------------------------------------------------
1 | m-23xecoatf
--------------------------------------------------------------------------------
/ecs-image-list/cn-qingdao/ecs-image-id:
--------------------------------------------------------------------------------
1 | m-28w0wqwa6
--------------------------------------------------------------------------------
/ecs-image-list/cn-shenzhen/ecs-image-id:
--------------------------------------------------------------------------------
1 | m-94ksoicp4
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spark on ECS
2 | 
3 | ## Introduction
4 | 1. [Script user guide](doc/manual.md)
5 | 2. [SSH tunnel setup](doc/ssh_tunnel.md)
6 | 
7 | ## Features
8 | --------
9 | 1. Quickly build an ECS-based Spark cluster
10 | A single script sets up an efficient, stable Spark runtime environment, and the cluster lifecycle can be managed simply and quickly from the command line
11 | 2. Spark tooling support
12 | The runtime environment integrates Spark-notebook, Hue and more, and exposes the native Spark UI, so you can run and debug code and monitor jobs from a convenient web interface
13 | 3. Security
14 | All web pages are reached through SSH tunnels, so port forwarding encrypts web access even on untrusted networks and protects personal privacy and important business information
15 | 4. Openness
16 | All software is taken from community open-source releases; the code is open and transparent, and the documentation is rich
17 | 
18 | 
--------------------------------------------------------------------------------
/bin/conf/nginx.conf.template:
--------------------------------------------------------------------------------
1 | worker_processes 1;
2 | error_log logs/error.log;
3 | error_log logs/error.log notice;
4 | error_log logs/error.log info;
5 | pid logs/nginx.pid;
6 | 
7 | events {
8 |     worker_connections 1024;
9 | }
10 | 
11 | http {
12 |     include mime.types;
13 |     default_type application/octet-stream;
14 |     sendfile off;
15 |     keepalive_timeout 20;
16 |     gzip on;
17 |     proxy_intercept_errors off;
18 | 
19 |     ${upstream_place_holder}
20 | 
21 |     ${server_place_holder}
22 | }
--------------------------------------------------------------------------------
/bin/service/spark_notebook.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #coding=utf-8
3 | import os
4 | import sys
5 | from core import ecs, utils
6 | from core.common import GlobalVar
7 | 
8 | def start_spark_notebook(masters, opts):
9 |     print "==> Starting Spark Notebook service..."
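    # Look up the master's intranet IP through the ECS API, then launch spark-notebook
    # on port 9090 in the background over an sshpass/ssh session.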
10 | master = masters[0] 11 | ins = ecs.get_instance_info(master) 12 | ip = ins['InnerIpAddress']['IpAddress'][0] 13 | launch_notebook = ' \" cd %s; nohup ./bin/spark-notebook -Dhttp.port=9090 > /dev/null 2>&1 & \" ' \ 14 | % GlobalVar.SPARK_NOTEBOOK_INSTALL_DIR 15 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, launch_notebook)) 16 | print "==> Started Spark Notebook service successfully..." 17 | 18 | def stop_spark_notebook(masters, opts): 19 | print "==> Stopping Spark Notebook..." 20 | master = masters[0] 21 | ins = ecs.get_instance_info(master) 22 | ip = ins['InnerIpAddress']['IpAddress'][0] 23 | stop_notebook = ' \" cd %s; cat RUNNING_PID | xargs -r kill -9; rm -f RUNNING_PID \" ' \ 24 | % GlobalVar.SPARK_NOTEBOOK_INSTALL_DIR 25 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, stop_notebook)) 26 | print "==> Stopped Spark Notebook service successfully..." 27 | -------------------------------------------------------------------------------- /bin/sh/mount_disk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /amrdata 4 | 5 | if which mkfs.ext4 > /dev/null ;then 6 | if ls /dev/xvdb1 > /dev/null;then 7 | if cat /etc/fstab|grep /amrdata > /dev/null ;then 8 | if cat /etc/fstab|grep /amrdata|grep ext3 > /dev/null ;then 9 | sed -i "/\/amrdata/d" /etc/fstab 10 | echo '/dev/xvdb1 /amrdata ext4 defaults 0 0' >> /etc/fstab 11 | fi 12 | else 13 | echo '/dev/xvdb1 /amrdata ext4 defaults 0 0' >> /etc/fstab 14 | fi 15 | mount -a 16 | echo "" 17 | exit; 18 | else 19 | if ls /dev/xvdb ;then 20 | fdisk /dev/xvdb << EOF 21 | n 22 | p 23 | 1 24 | 25 | 26 | wq 27 | EOF 28 | mkfs.ext4 /dev/xvdb1 29 | echo '/dev/xvdb1 /amrdata ext4 defaults 0 0' >> /etc/fstab 30 | fi 31 | fi 32 | else 33 | if ls /dev/xvdb1 > /dev/null;then 34 | if cat /etc/fstab|grep /amrdata > /dev/null ;then 35 | echo "" 36 | else 37 | echo '/dev/xvdb1 /amrdata ext3 defaults 0 0' >> /etc/fstab 38 | fi 39 | mount -a 40 | echo "" 41 | exit; 42 | else 43 | if ls /dev/xvdb ;then 44 | fdisk /dev/xvdb << EOF 45 | n 46 | p 47 | 1 48 | 49 | 50 | wq 51 | EOF 52 | mkfs.ext3 /dev/xvdb1 53 | echo '/dev/xvdb1 /amrdata ext3 defaults 0 0' >> /etc/fstab 54 | fi 55 | fi 56 | fi 57 | 58 | mount -a -------------------------------------------------------------------------------- /bin/core/nginx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding=utf-8 3 | import sys 4 | import os 5 | import utils 6 | from core.common import GlobalVar 7 | from config_nginx import generate_config_file 8 | 9 | def copy_file(opts, src_file, ip, dst): 10 | try: 11 | os.system("sshpass -p %s scp -r %s %s %s@%s:%s" % (opts.pwd, " ".join(utils.ssh_args()), src_file, opts.user, ip, dst)) 12 | except Exception as e: 13 | print(e.message) 14 | raise e 15 | 16 | def execute_remote_command(opts, ip, command): 17 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, command)) 18 | 19 | def execute_local_command(command): 20 | os.system(command) 21 | 22 | def start_nginx(opts, host_info_file, ip): 23 | try: 24 | nginx_config_template_file = "%s/conf/nginx.conf.template" % GlobalVar.SPARK_ECS_DIR 25 | local_nginx_config = "%s/conf/nginx.conf" % GlobalVar.SPARK_ECS_DIR 26 | dst = "/opt/nginx-1.9.1/conf/nginx.conf" 27 | generate_config_file(host_info_file, nginx_config_template_file, local_nginx_config) 28 | 
copy_file(opts, local_nginx_config, ip, dst)
29 |         start_nginx_command = "/opt/nginx-1.9.1/sbin/nginx"
30 |         execute_remote_command(opts, ip, start_nginx_command)
31 |         return 1
32 |     except Exception as e:
33 |         print "start nginx failed %s" % str(e.message)
34 |         return -1
35 | 
36 | def do_stop_nginx(opts, ip):
37 |     try:
38 |         stopNginxCommand = "/opt/nginx-1.9.1/sbin/nginx -s stop"
39 |         execute_remote_command(opts, ip, stopNginxCommand)
40 |         return 1
41 |     except Exception as e:
42 |         print "stop nginx failed " + str(e.message)
43 |         return -1
44 | 
45 | 
46 | 
47 | 
48 | 
--------------------------------------------------------------------------------
/bin/core/common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #coding=utf-8
3 | import os
4 | import sys
5 | 
6 | class GlobalVar:
7 | 
8 |     DEFAULT_CONF_DIR = "/root/.config"
9 |     PROPERTY_FILE = "%s/packages.property" % DEFAULT_CONF_DIR
10 |     HADOOP_INSTALL_DIR = "/opt/hadoop"
11 |     HADOOP_CONF_DIR = "%s/etc/hadoop" % HADOOP_INSTALL_DIR
12 |     SPARK_INSTALL_DIR = "/opt/spark"
13 |     SPARK_CONF_DIR = "%s/conf" % SPARK_INSTALL_DIR
14 |     SPARK_NOTEBOOK_INSTALL_DIR = "/opt/spark-notebook"
15 |     HUE_INSTALL_DIR = "/opt/hue"
16 |     ALIYUN_SDK_URL = "http://docs-aliyun-com-cn-b.oss-cn-hangzhou.aliyuncs.com/ecs/assets/sdk/python_sdk.tgz"
17 |     SPARK_ECS_DIR = ""
18 |     CLUSTER_STATUS = "%s/status/cluster-" % DEFAULT_CONF_DIR
19 |     CLUSTER_INSTANCES = "%s/instances/" % DEFAULT_CONF_DIR
20 |     CLUSTER_HOSTS = ""
21 | 
22 |     ECS_API_PAGESIZE = 50
23 | 
24 |     ECS_INSTANCE_TYPE = {
25 |         "ecs.t1.small": (1, 1),
26 |         "ecs.s1.small": (1, 2),
27 |         "ecs.s1.medium": (1, 4),
28 |         "ecs.s2.small": (2, 2),
29 |         "ecs.s2.large": (2, 4),
30 |         "ecs.s2.xlarge": (2, 8),
31 |         "ecs.s3.medium": (4, 4),
32 |         "ecs.s3.large": (4, 8),
33 |         "ecs.m1.medium": (4, 16)
34 |     }
35 | 
36 |     ECS_REGION = {
37 |         "1": "cn-hangzhou",
38 |         "2": "cn-shenzhen",
39 |         "3": "cn-beijing",
40 |         "4": "cn-qingdao"
41 |     }
42 | 
43 |     SPARK_IMAGES = {
44 |         ("Spark-1.3.1", "cn-hangzhou"): "m-23xecoatf",
45 |         ("Spark-1.3.1", "cn-shenzhen"): "m-94ksoicp4",
46 |         ("Spark-1.3.1", "cn-beijing"): "m-25dt21m47",
47 |         ("Spark-1.3.1", "cn-qingdao"): "m-28w0wqwa6"
48 |     }
49 | 
50 |     AVAILABLE_SAPRK_VERSION = {
51 |         "1": "Spark-1.3.1"
52 |     }
53 | 
--------------------------------------------------------------------------------
/bin/service/hue.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #coding=utf-8
3 | import os
4 | import sys
5 | from core import ecs, utils
6 | from core.common import GlobalVar
7 | 
8 | def start_hue(masters, opts):
9 |     print "==> Starting HUE service..."
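    # Resolve the master's intranet IP, copy the default hue.ini into the Hue install
    # directory, then start the Livy server and the Hue supervisor in the background over ssh.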
10 | master = masters[0] 11 | ins = ecs.get_instance_info(master) 12 | ip = ins['InnerIpAddress']['IpAddress'][0] 13 | copy_command = ' \"/bin/cp -r %s/hue/desktop/conf/hue.ini %s/desktop/conf/ \"' \ 14 | % (GlobalVar.DEFAULT_CONF_DIR, GlobalVar.HUE_INSTALL_DIR) 15 | launch_hue_step1 = ' \"source /root/.bash_profile; cd %s/build/env/bin/; nohup ./hue livy_server > /dev/null 2>&1 & \" ' \ 16 | % GlobalVar.HUE_INSTALL_DIR 17 | launch_hue_step2 = ' \"source /root/.bash_profile; cd %s/build/env/bin/; nohup ./supervisor > /dev/null 2>&1 & \" ' \ 18 | % GlobalVar.HUE_INSTALL_DIR 19 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, copy_command)) 20 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, launch_hue_step1)) 21 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, launch_hue_step2)) 22 | print "==> Started HUE service successfully" 23 | 24 | def stop_hue(masters, opts): 25 | print "==> Stopping HUE service..." 26 | master = masters[0] 27 | ins = ecs.get_instance_info(master) 28 | ip = ins['InnerIpAddress']['IpAddress'][0] 29 | stop_hue_step1 = ' \" pgrep supervisor | xargs -r kill -9 \" ' 30 | stop_hue_step2 = ' \" ps -ef | grep livy.server.Main | grep -v grep | awk \'{print \$2}\' | xargs -r kill -9 \" ' 31 | stop_hue_step3 = ' \" pgrep hue | xargs -r kill -9 \" ' 32 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, stop_hue_step1)) 33 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, stop_hue_step2)) 34 | os.system("sshpass -p %s ssh %s %s@%s %s" % (opts.pwd, " ".join(utils.ssh_args()), opts.user, ip, stop_hue_step3)) 35 | print "==> Stopped HUE service successfully" 36 | -------------------------------------------------------------------------------- /bin/service/spark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding=utf-8 3 | import sys 4 | import os 5 | from core import ecs, utils 6 | from core.common import GlobalVar 7 | 8 | def setup_cluster(masters, slaves, opts, deploy_ssh_key): 9 | master = masters[0] 10 | if deploy_ssh_key: 11 | print "==> Generating cluster's SSH key on master..." 12 | key_setup = """ 13 | [ -f ~/.ssh/id_rsa ] || 14 | (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa && 15 | cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys) 16 | """ 17 | utils.do_ssh(master, opts, key_setup) 18 | dot_ssh_tar = utils.ssh_read(master, opts, ['tar', 'c', '.ssh']) 19 | print "==> Transferring cluster's SSH key to slaves..." 20 | for slave in slaves: 21 | utils.ssh_write(slave, opts, ['tar', 'x'], dot_ssh_tar) 22 | 23 | print "==> Updating /etc/hosts for each ECS instance..." 24 | utils.prepare_hosts(master, slaves, opts) 25 | 26 | print "==> Updating Spark default configuration..." 27 | # copy default hadoop config 28 | os.system(" /bin/cp -r %s/spark/conf/* %s" 29 | % (GlobalVar.DEFAULT_CONF_DIR, GlobalVar.SPARK_CONF_DIR)) 30 | utils.do_scp(masters[0], opts, GlobalVar.SPARK_CONF_DIR, GlobalVar.SPARK_INSTALL_DIR) 31 | for slave in slaves: 32 | utils.do_scp(slave, opts, GlobalVar.SPARK_CONF_DIR, GlobalVar.SPARK_INSTALL_DIR) 33 | 34 | print "==> Starting spark cluster..." 
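    # start_spark_cluster brings up the standalone master with sbin/start-master.sh and then
    # registers each slave as a worker against spark://<master-hostname>:7077.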
35 | start_spark_cluster(master, slaves, opts) 36 | 37 | def start_spark_cluster(master, slaves, opts): 38 | ins = ecs.get_instance_info(master) 39 | master_name = ins['HostName'] 40 | start_master = "%s/sbin/start-master.sh " % GlobalVar.SPARK_INSTALL_DIR 41 | utils.do_ssh(master, opts, str(start_master)) 42 | for slave in slaves: 43 | instance_info = ecs.get_instance_info(slave) 44 | worker_name = instance_info['HostName'] 45 | start_slave = "%s/sbin/start-slave.sh %s spark://%s:7077" \ 46 | % (GlobalVar.SPARK_INSTALL_DIR, worker_name, master_name) 47 | utils.do_ssh(slave, opts, str(start_slave)) 48 | print "==> Started spark cluster successfully!" 49 | 50 | def stop_spark_cluster(masters, slaves, opts): 51 | master = masters[0] 52 | stop_master = "%s/sbin/stop-master.sh " % GlobalVar.SPARK_INSTALL_DIR 53 | print "==> Stopping Spark Master..." 54 | utils.do_ssh(master, opts, str(stop_master)) 55 | 56 | print "==> Stopping Spark Slaves..." 57 | for slave in slaves: 58 | instance_info = ecs.get_instance_info(slave) 59 | worker_name = instance_info['HostName'] 60 | stop_slave = "%s/sbin/spark-daemon.sh stop org.apache.spark.deploy.worker.Worker %s" \ 61 | % (GlobalVar.SPARK_INSTALL_DIR, worker_name) 62 | utils.do_ssh(slave, opts, str(stop_slave)) -------------------------------------------------------------------------------- /bin/service/hdfs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding=utf-8 3 | import os 4 | import sys 5 | from core import ecs, utils 6 | from core.common import GlobalVar 7 | 8 | def setup_hdfs(masters, slaves, opts): 9 | print "==> Updating Hadoop configuration for each ECS instance..." 10 | # copy default hadoop config 11 | os.system(" /bin/cp -r %s/hadoop/etc/hadoop/* %s/etc/hadoop/" 12 | % (GlobalVar.DEFAULT_CONF_DIR, GlobalVar.HADOOP_INSTALL_DIR)) 13 | 14 | master_intranet_ip = ecs.get_instance_info(masters[0])['InnerIpAddress']['IpAddress'][0] 15 | namenode = "hdfs://%s:9000" % master_intranet_ip 16 | utils.update_hadoop_configuration(namenode) 17 | utils.do_scp(masters[0], opts, GlobalVar.HADOOP_CONF_DIR, "%s/etc/" % GlobalVar.HADOOP_INSTALL_DIR) 18 | for slave in slaves: 19 | utils.do_scp(slave, opts, GlobalVar.HADOOP_CONF_DIR, "%s/etc/" % GlobalVar.HADOOP_INSTALL_DIR) 20 | 21 | print "==> Starting HDFS service..." 22 | start_hdfs(masters[0], slaves, opts) 23 | print "==> Started HDFS service successfully" 24 | 25 | def start_hdfs(master, slaves, opts): 26 | utils.warning() 27 | msg = "If this is the first time, you need to format HDFS, otherwise you should not format it! \n" \ 28 | "Format HDFS (Y/n): " 29 | confirm = raw_input(msg) 30 | if confirm == 'Y': 31 | msg = "Confirm to format HDFS? (Y/n): " 32 | confirm_again = raw_input(msg) 33 | if confirm_again == "Y": 34 | print "==> Formatting HDFS..." 35 | format_hdfs = "%s/bin/hdfs namenode -format -force 2> /dev/null" % GlobalVar.HADOOP_INSTALL_DIR 36 | utils.do_ssh(master, opts, str(format_hdfs)) 37 | else: 38 | print "==> Not `Y`, skipping formatting HDFS..." 39 | else: 40 | print "==> Not `Y`, skipping formatting HDFS..." 41 | 42 | print "==> Starting namenode..." 43 | start_namenode = "%s/sbin/hadoop-daemon.sh --config %s --script hdfs start namenode" \ 44 | % (GlobalVar.HADOOP_INSTALL_DIR, GlobalVar.HADOOP_CONF_DIR) 45 | utils.do_ssh(master, opts, start_namenode) 46 | 47 | print "==> Starting datanode..." 
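    # Bring up a datanode on every slave with hadoop-daemon.sh, pointing each one at the
    # shared Hadoop configuration directory.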
48 | for slave in slaves: 49 | start_datanode = "%s/sbin/hadoop-daemon.sh --config %s --script hdfs start datanode" \ 50 | % (GlobalVar.HADOOP_INSTALL_DIR, GlobalVar.HADOOP_CONF_DIR) 51 | utils.do_ssh(slave, opts, start_datanode) 52 | 53 | def stop_hdfs(masters, slaves, opts): 54 | print "==> Stopping namenode..." 55 | master = masters[0] 56 | stop_namenode = "%s/sbin/hadoop-daemon.sh --config %s --script hdfs stop namenode" \ 57 | % (GlobalVar.HADOOP_INSTALL_DIR, GlobalVar.HADOOP_CONF_DIR) 58 | utils.do_ssh(master, opts, stop_namenode) 59 | 60 | print "==> Stopping datanodes..." 61 | for slave in slaves: 62 | stop_datanode = "%s/sbin/hadoop-daemon.sh --config %s --script hdfs stop datanode" \ 63 | % (GlobalVar.HADOOP_INSTALL_DIR, GlobalVar.HADOOP_CONF_DIR) 64 | utils.do_ssh(slave, opts, stop_datanode) 65 | print "==> Stopped HDFS service successfully" 66 | -------------------------------------------------------------------------------- /doc/ssh_tunnel.md: -------------------------------------------------------------------------------- 1 | # 打通SSH隧道 2 | 3 | 打通`本机 <--> Spark Master`, 以便在本机访问Spark UI, Hue, Spark Notebook. 4 | 5 | 要连接主节点的 SparkUI、HUE、Spark-notebook的UI界面,需要创建本机到Spark主节点的SSH隧道,以本地端口转发到远程端口的安全的方式访问。具体的创建步骤如下: 6 | 7 | ## SSH客户端配置 8 | 9 | 支持PuTTY(windows)或OpenSSH(linux、Max OSX) 10 | 11 | ### windows相关配置 12 | 13 | 1. 首先[下载PuTTY](http://www.chiark.greenend.org.uk/~sgtatham/putty/download.html) 14 | 2. 配置PuTTY 15 | * 首先创建一个session并配置好Master的IP地址和22端口号并保存session。这一步的目的是能连接到SSH Server建立一个SSH通道 16 | ![](http://i.imgur.com/AgmjuGL.jpg) 17 | * 切换到Tunnel面板,分别配置Source Port和 Destination的IP端口,然后点击Add保存端口转发映射 18 | ![](http://i.imgur.com/MWOj90s.jpg) 19 | 20 | 3. 点击open按钮,输入用户名密码登陆 21 | 这样就建立好了一个带有端口转发的SSH隧道。访问`http://127.0.0.1:8888`端口的请求就会被转发到远程机器的`9000`端口。通过此方式,就可以安全的访问Spark UI、spark-notebook、和HUE的页面了。 22 | 23 | ### Linux相关配置 24 | 25 | 1. 安装openssh (ECS默认都有安装) 26 | 2. 执行命令 `ssh -N -f -L port1:127.0.0.1:port2 username@ip` 27 | 28 | *参数说明* 29 | 30 | 参数 | 描述 31 | ------------ | ------------- 32 | -N | 参数告诉SSH客户端,改命令仅仅做端口转发 33 | -f|告诉SSH客户端在后台运行 34 | -L|做本地映射端口 35 | port1|要使用的本地端口 36 | port2|要映射的远程端口 37 | username|登陆远程机器的用户名 38 | ip|要建立通道的远程机器的IP 39 | 40 | 41 | > 连接成功后,在浏览器访问 127.0.0.1:port1 就可以被转发到服务器的 ip:port2端口了 42 | > 因为直接访问服务的器的目标端口是被防火墙屏蔽的,所以SSH隧道技术,可以绕过防火墙的设置,并提供了一个安全访问的机制。 43 | 44 | ## 使用SparkUI、spark-notebook、Hue 45 | 46 | 请确保上文中SSH隧道能够打通 47 | 48 | ### Web服务的端口映射绑定 49 | 50 | SparkUI的配置: 51 | 52 | #### Linux命令行执行如下命令 53 | 1. 将本地`8081`绑定到远程`80`端口 54 | `ssh -N -f -L 8081:127.0.0.1:80 username@ip` 55 | > username和ip分别为登陆master机器的username和IP 56 | 2. 将本地`80`绑定到远程`80`端口 57 | `ssh -N -f -L 80:127.0.0.1:80 username@ip` 58 | > username和ip分别为登陆master机器的username和IP 59 | 3. 将本地`8080`绑定到远程`80`端口 60 | `ssh -N -f -L 8080:127.0.0.1:80 username@ip` 61 | > username和ip分别为登陆master机器的username和IP 62 | 4. 将本地`4040`绑定到远程`4040`端口 63 | `ssh -N -f -L 4040:127.0.0.1:4040 username@ip` 64 | > username和ip分别为登陆master机器的username和IP 65 | 66 | #### windows下 Putty的配置 67 | 1. 将本地 8081 绑定到远程 80 端口 68 | 结合上图切换到 Tunnel对应的选项卡: 69 | * source port填写 8081 70 | * Destination 填写 127.0.0.1:80 71 | 2. 将本地 80 绑定到远程 80 端口 72 | 结合上图切换到 Tunnel对应的选项卡: 73 | * source port填写 80 74 | * Destination 填写 127.0.0.1:80 75 | 3. 将本地 8080 绑定到远程 80 端口 76 | 结合上图切换到 Tunnel对应的选项卡: 77 | * source port填写 8080 78 | * Destination 填写 127.0.0.1:80 79 | 4. 
将本地 4040 绑定到远程 4040 端口 80 | 结合上图切换到 Tunnel对应的选项卡: 81 | * source port填写 4040 82 | * Destination 填写 127.0.0.1:4040 83 | 84 | #### 将Spark master和所有slave的机器名绑定127.0.0.1 85 | 86 | 如: `127.0.0.1 23 hxs787e` 87 | 88 | * windows hosts文件路径: `C:\Windows\System32\drivers\etc\hosts` 89 | * linux hosts文件路径: `/etc/hosts` 90 | 91 | #### Spark-notebook的配置: 92 | 93 | 1. Linux命令行执行如下命令: 94 | `ssh -N -f -L port1:127.0.0.1:9090 username@ip` 95 | *username和ip分别为登陆master机器的username和IP,port1的值为与本机其他端口不冲突的任意有效值* 96 | 2. windows Putty的配置: 97 | 结合上图切换到 Tunnel对应的选项卡: 98 | * source port填写与本机其他端口不冲突的任意有效值 99 | * Destination 填写 127.0.0.1:9090 100 | 101 | #### 配置Hue 102 | 103 | 1. Linux命令行执行如下命令: 104 | `ssh -N -f -L port1:127.0.0.1:8888 username@ip` 105 | *username和ip分别为登陆master机器的username和IP,port1的值为与本机其他端口不冲突的任意有效值* 106 | 107 | 2. windows Putty的配置 108 | 结合上图切换到 Tunnel对应的选项卡: 109 | * source port填写与本机其他端口不冲突的任意有效值 110 | * Destination 填写 127.0.0.1:8888 -------------------------------------------------------------------------------- /bin/core/config_nginx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding=utf-8 3 | import os 4 | import sys 5 | 6 | def do_generate_upstream_server_config(spark_host_info_path): 7 | 8 | format_tab = "\t" 9 | format_tab2 = format_tab*2 10 | up_stream_place_template = format_tab+"upstream server_${hostname} {"+os.linesep + \ 11 | format_tab2 + "server ${host}:${port};" + os.linesep + \ 12 | format_tab + "}" 13 | server_place_holder_template = format_tab+"server {" + os.linesep + \ 14 | format_tab2 + "listen 80;" + os.linesep + \ 15 | format_tab2 + "server_name ${hostname};"+os.linesep + \ 16 | format_tab2 + "location / {" + os.linesep + \ 17 | format_tab2 + " proxy_pass http://server_${hostname};" + os.linesep + \ 18 | format_tab2 + "}"+os.linesep + \ 19 | format_tab + "}" 20 | spark_host_info_file = open(spark_host_info_path) 21 | host_info_lines = spark_host_info_file.readlines()[1:] 22 | 23 | up_stream_str = "" 24 | server_stream_str = "" 25 | 26 | spark_master_host_name = "spark_master" 27 | up_stream_master_item = up_stream_place_template.replace("${hostname}", spark_master_host_name)\ 28 | .replace("${host}", "127.0.0.1")\ 29 | .replace("${port}", "8080").replace("\t", "", 1) 30 | server_stream_master_item = \ 31 | server_place_holder_template.replace("${hostname}", spark_master_host_name).replace("\t", "", 1) 32 | 33 | up_stream_str += up_stream_master_item + os.linesep 34 | server_stream_str += server_stream_master_item+os.linesep 35 | 36 | for host_info in host_info_lines: 37 | host_info_list = host_info.split() 38 | up_stream_item = up_stream_place_template.replace("${hostname}", host_info_list[1].strip()) \ 39 | .replace("${host}", host_info_list[0].strip()) \ 40 | .replace("${port}", "8081") 41 | server_stream_item = server_place_holder_template.replace("${hostname}", host_info_list[1].strip()) 42 | 43 | up_stream_str += up_stream_item.rstrip() + os.linesep 44 | server_stream_str += server_stream_item.rstrip()+os.linesep 45 | return up_stream_str, server_stream_str 46 | 47 | def do_update_nginx_config_file(result_content, nginx_config_target_path): 48 | nginx_config_file = file(nginx_config_target_path, "w") 49 | nginx_config_file.write(result_content) 50 | 51 | def generate_config_file(spark_host_info_path,nginx_config_template_path, nginx_config_taget_path): 52 | 53 | up_stream_place_holder="${upstream_place_holder}" 54 | server_place_holder="${server_place_holder}" 55 | 56 | nginx_upstream_server_tuple = 
do_generate_upstream_server_config(spark_host_info_path) 57 | 58 | nginx_config_template_file = open(nginx_config_template_path) 59 | nginx_config_template_lines = nginx_config_template_file.readlines() 60 | result_content = "" 61 | for line in nginx_config_template_lines: 62 | result_content += line 63 | 64 | result_content = result_content.replace(up_stream_place_holder, nginx_upstream_server_tuple[0]) \ 65 | .replace(server_place_holder, nginx_upstream_server_tuple[1]) 66 | 67 | do_update_nginx_config_file(result_content, nginx_config_taget_path) 68 | 69 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The Artistic License 2.0 2 | 3 | Copyright (c) 2015 aliyun 4 | 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | This license establishes the terms under which a given free software 11 | Package may be copied, modified, distributed, and/or redistributed. 12 | The intent is that the Copyright Holder maintains some artistic 13 | control over the development of that Package while still keeping the 14 | Package available as open source and free software. 15 | 16 | You are always permitted to make arrangements wholly outside of this 17 | license directly with the Copyright Holder of a given Package. If the 18 | terms of this license do not permit the full use that you propose to 19 | make of the Package, you should contact the Copyright Holder and seek 20 | a different licensing arrangement. 21 | 22 | Definitions 23 | 24 | "Copyright Holder" means the individual(s) or organization(s) 25 | named in the copyright notice for the entire Package. 26 | 27 | "Contributor" means any party that has contributed code or other 28 | material to the Package, in accordance with the Copyright Holder's 29 | procedures. 30 | 31 | "You" and "your" means any person who would like to copy, 32 | distribute, or modify the Package. 33 | 34 | "Package" means the collection of files distributed by the 35 | Copyright Holder, and derivatives of that collection and/or of 36 | those files. A given Package may consist of either the Standard 37 | Version, or a Modified Version. 38 | 39 | "Distribute" means providing a copy of the Package or making it 40 | accessible to anyone else, or in the case of a company or 41 | organization, to others outside of your company or organization. 42 | 43 | "Distributor Fee" means any fee that you charge for Distributing 44 | this Package or providing support for this Package to another 45 | party. It does not mean licensing fees. 46 | 47 | "Standard Version" refers to the Package if it has not been 48 | modified, or has been modified only in ways explicitly requested 49 | by the Copyright Holder. 50 | 51 | "Modified Version" means the Package, if it has been changed, and 52 | such changes were not explicitly requested by the Copyright 53 | Holder. 54 | 55 | "Original License" means this Artistic License as Distributed with 56 | the Standard Version of the Package, in its current version or as 57 | it may be modified by The Perl Foundation in the future. 58 | 59 | "Source" form means the source code, documentation source, and 60 | configuration files for the Package. 61 | 62 | "Compiled" form means the compiled bytecode, object code, binary, 63 | or any other form resulting from mechanical transformation or 64 | translation of the Source form. 
65 | 66 | 67 | Permission for Use and Modification Without Distribution 68 | 69 | (1) You are permitted to use the Standard Version and create and use 70 | Modified Versions for any purpose without restriction, provided that 71 | you do not Distribute the Modified Version. 72 | 73 | 74 | Permissions for Redistribution of the Standard Version 75 | 76 | (2) You may Distribute verbatim copies of the Source form of the 77 | Standard Version of this Package in any medium without restriction, 78 | either gratis or for a Distributor Fee, provided that you duplicate 79 | all of the original copyright notices and associated disclaimers. At 80 | your discretion, such verbatim copies may or may not include a 81 | Compiled form of the Package. 82 | 83 | (3) You may apply any bug fixes, portability changes, and other 84 | modifications made available from the Copyright Holder. The resulting 85 | Package will still be considered the Standard Version, and as such 86 | will be subject to the Original License. 87 | 88 | 89 | Distribution of Modified Versions of the Package as Source 90 | 91 | (4) You may Distribute your Modified Version as Source (either gratis 92 | or for a Distributor Fee, and with or without a Compiled form of the 93 | Modified Version) provided that you clearly document how it differs 94 | from the Standard Version, including, but not limited to, documenting 95 | any non-standard features, executables, or modules, and provided that 96 | you do at least ONE of the following: 97 | 98 | (a) make the Modified Version available to the Copyright Holder 99 | of the Standard Version, under the Original License, so that the 100 | Copyright Holder may include your modifications in the Standard 101 | Version. 102 | 103 | (b) ensure that installation of your Modified Version does not 104 | prevent the user installing or running the Standard Version. In 105 | addition, the Modified Version must bear a name that is different 106 | from the name of the Standard Version. 107 | 108 | (c) allow anyone who receives a copy of the Modified Version to 109 | make the Source form of the Modified Version available to others 110 | under 111 | 112 | (i) the Original License or 113 | 114 | (ii) a license that permits the licensee to freely copy, 115 | modify and redistribute the Modified Version using the same 116 | licensing terms that apply to the copy that the licensee 117 | received, and requires that the Source form of the Modified 118 | Version, and of any works derived from it, be made freely 119 | available in that license fees are prohibited but Distributor 120 | Fees are allowed. 121 | 122 | 123 | Distribution of Compiled Forms of the Standard Version 124 | or Modified Versions without the Source 125 | 126 | (5) You may Distribute Compiled forms of the Standard Version without 127 | the Source, provided that you include complete instructions on how to 128 | get the Source of the Standard Version. Such instructions must be 129 | valid at the time of your distribution. If these instructions, at any 130 | time while you are carrying out such distribution, become invalid, you 131 | must provide new instructions on demand or cease further distribution. 132 | If you provide valid instructions or cease distribution within thirty 133 | days after you become aware that the instructions are invalid, then 134 | you do not forfeit any of your rights under this license. 
135 | 136 | (6) You may Distribute a Modified Version in Compiled form without 137 | the Source, provided that you comply with Section 4 with respect to 138 | the Source of the Modified Version. 139 | 140 | 141 | Aggregating or Linking the Package 142 | 143 | (7) You may aggregate the Package (either the Standard Version or 144 | Modified Version) with other packages and Distribute the resulting 145 | aggregation provided that you do not charge a licensing fee for the 146 | Package. Distributor Fees are permitted, and licensing fees for other 147 | components in the aggregation are permitted. The terms of this license 148 | apply to the use and Distribution of the Standard or Modified Versions 149 | as included in the aggregation. 150 | 151 | (8) You are permitted to link Modified and Standard Versions with 152 | other works, to embed the Package in a larger work of your own, or to 153 | build stand-alone binary or bytecode versions of applications that 154 | include the Package, and Distribute the result without restriction, 155 | provided the result does not expose a direct interface to the Package. 156 | 157 | 158 | Items That are Not Considered Part of a Modified Version 159 | 160 | (9) Works (including, but not limited to, modules and scripts) that 161 | merely extend or make use of the Package, do not, by themselves, cause 162 | the Package to be a Modified Version. In addition, such works are not 163 | considered parts of the Package itself, and are not subject to the 164 | terms of this license. 165 | 166 | 167 | General Provisions 168 | 169 | (10) Any use, modification, and distribution of the Standard or 170 | Modified Versions is governed by this Artistic License. By using, 171 | modifying or distributing the Package, you accept this license. Do not 172 | use, modify, or distribute the Package, if you do not accept this 173 | license. 174 | 175 | (11) If your Modified Version has been derived from a Modified 176 | Version made by someone other than you, you are nevertheless required 177 | to ensure that your Modified Version complies with the requirements of 178 | this license. 179 | 180 | (12) This license does not grant you the right to use any trademark, 181 | service mark, tradename, or logo of the Copyright Holder. 182 | 183 | (13) This license includes the non-exclusive, worldwide, 184 | free-of-charge patent license to make, have made, use, offer to sell, 185 | sell, import and otherwise transfer the Package with respect to any 186 | patent claims licensable by the Copyright Holder that are necessarily 187 | infringed by the Package. If you institute patent litigation 188 | (including a cross-claim or counterclaim) against any party alleging 189 | that the Package constitutes direct or contributory patent 190 | infringement, then this Artistic License to you shall terminate on the 191 | date that such litigation is filed. 192 | 193 | (14) Disclaimer of Warranty: 194 | THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS 195 | IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED 196 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR 197 | NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL 198 | LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL 199 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL 200 | DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF 201 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
202 | 
203 | 
--------------------------------------------------------------------------------
/bin/core/ecs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #coding=utf-8
3 | import sys
4 | import os
5 | import tarfile
6 | import time
7 | import commands
8 | import urllib2
9 | import utils
10 | from sys import stderr
11 | from datetime import datetime
12 | from common import GlobalVar
13 | 
14 | def setup_aliyun_sdk():
15 |     lib_dir = os.path.join(GlobalVar.SPARK_ECS_DIR, "lib")
16 |     if not os.path.exists(lib_dir):
17 |         os.mkdir(lib_dir)
18 |     ecs_sdk_lib_dir = os.path.join(lib_dir, "aliyun-sdk")
19 |     if not os.path.isdir(ecs_sdk_lib_dir):
20 |         tgz_file_path = os.path.join(lib_dir, "aliyun-sdk.tgz")
21 |         print "Downloading Aliyun sdk..."
22 |         download_stream = urllib2.urlopen(GlobalVar.ALIYUN_SDK_URL)
23 |         with open(tgz_file_path, "wb") as tgz_file:
24 |             tgz_file.write(download_stream.read())
25 |         tar = tarfile.open(tgz_file_path)
26 |         tar.extractall(path=lib_dir)
27 |         tar.close()
28 |         os.remove(tgz_file_path)
29 |         os.system("mv %s/* %s/aliyun-sdk" % (lib_dir, lib_dir))
30 |         print "Finished downloading Aliyun sdk"
31 |     sys.path.insert(0, ecs_sdk_lib_dir)
32 | 
33 | setup_aliyun_sdk()
34 | import aliyun.api
35 | 
36 | def set_secret_key():
37 |     access_id = os.getenv('ALIYUN_ACCESS_ID')
38 |     if access_id is None:
39 |         print >> stderr, ("ERROR: The environment variable ALIYUN_ACCESS_ID must be set")
40 |         sys.exit(1)
41 |     access_key = os.getenv('ALIYUN_ACCESS_KEY')
42 |     if access_key is None:
43 |         print >> stderr, ("ERROR: The environment variable ALIYUN_ACCESS_KEY must be set")
        sys.exit(1)
44 |     aliyun.setDefaultAppInfo(access_id, access_key)
45 | 
46 | set_secret_key()
47 | 
48 | def check_aliyun_api_ret_code(response):
49 |     if "Code" in response:
50 |         print "Fail."
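        # An error response from the Aliyun API carries "Code" and "Message" fields;
        # surface them and raise so the calling operation is aborted.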
51 | print response['Code'] 52 | print response['Message'] 53 | raise RuntimeError(response['Code'], response['Message']) 54 | 55 | def authorize_security_group_in(group_id, ip_protocol, src_group_id, src_cidr_ip, port_range, opts): 56 | req = aliyun.api.Ecs20140526AuthorizeSecurityGroupRequest() 57 | req.RegionId = opts.region 58 | req.SecurityGroupId = group_id 59 | req.IpProtocol = ip_protocol 60 | req.PortRange = port_range 61 | if src_cidr_ip == "": 62 | req.SourceGroupId = src_group_id 63 | req.NicType = "intranet" 64 | else: 65 | req.SourceCidrIp = src_cidr_ip 66 | req.NicType = "internet" 67 | f = req.getResponse() 68 | check_aliyun_api_ret_code(f) 69 | 70 | def authorize_security_group_out(group_id, ip_protocol, dst_group_id, dst_cidr_ip, port_range, opts): 71 | req = aliyun.api.Ecs20140526AuthorizeSecurityGroupEgressRequest() 72 | req.RegionId = opts.region 73 | req.SecurityGroupId = group_id 74 | req.IpProtocol = ip_protocol 75 | req.PortRange = port_range 76 | if dst_cidr_ip == "": 77 | req.DestGroupId = dst_group_id 78 | req.NicType = "intranet" 79 | else: 80 | req.DestCidrIp = dst_cidr_ip 81 | req.NicType = "internet" 82 | f = req.getResponse() 83 | check_aliyun_api_ret_code(f) 84 | 85 | def get_security_group_rules(security_group_id, opts): 86 | req = aliyun.api.Ecs20140526DescribeSecurityGroupAttributeRequest() 87 | req.SecurityGroupId = security_group_id 88 | req.RegionId = opts.region 89 | f = req.getResponse() 90 | check_aliyun_api_ret_code(f) 91 | permissions = f['Permissions']['Permission'] 92 | return permissions 93 | 94 | def get_all_instances(opts): 95 | page_number = 1 96 | instances = [] 97 | req = aliyun.api.Ecs20140526DescribeInstancesRequest() 98 | req.RegionId = opts.region 99 | req.PageSize = GlobalVar.ECS_API_PAGESIZE 100 | req.PageNumber = page_number 101 | f = req.getResponse() 102 | check_aliyun_api_ret_code(f) 103 | instances += f['Instances']['Instance'] 104 | total_pages = f['TotalCount'] / (GlobalVar.ECS_API_PAGESIZE + 1) + 1 105 | while page_number < total_pages: 106 | page_number += 1 107 | req = aliyun.api.Ecs20140526DescribeInstancesRequest() 108 | req.RegionId = opts.region 109 | req.PageSize = GlobalVar.ECS_API_PAGESIZE 110 | req.PageNumber = page_number 111 | f = req.getResponse() 112 | check_aliyun_api_ret_code(f) 113 | instances += f['Instances']['Instance'] 114 | 115 | return instances 116 | 117 | def get_gateway_instance_info(opts): 118 | ip = commands.getoutput("""ifconfig eth0 | awk 'NR==2 {print $2}' | awk -F'[:]' '{print $2}'""") 119 | all_instances = get_all_instances(opts) 120 | for ins in all_instances: 121 | inner_ips = ins['InnerIpAddress']['IpAddress'] 122 | public_ips = ins['PublicIpAddress']['IpAddress'] 123 | if ip in inner_ips + public_ips: 124 | return ins 125 | raise RuntimeError('Could find instance information of the current gateway.') 126 | 127 | def clear_security_group_rules(group_id, opts): 128 | security_group_rules = get_security_group_rules(group_id, opts) 129 | for rule in security_group_rules: 130 | if rule['SourceGroupId'] != "" or rule['SourceCidrIp'] != "": 131 | req = aliyun.api.Ecs20140526RevokeSecurityGroupRequest() 132 | req.SourceGroupId = rule['SourceGroupId'] 133 | req.SourceCidrIp = rule['SourceCidrIp'] 134 | else: 135 | req = aliyun.api.Ecs20140526RevokeSecurityGroupEgressRequest() 136 | req.DestGroupId = rule['DestGroupId'] 137 | req.DestCidrIp = rule['DestCidrIp'] 138 | req.SecurityGroupId = group_id 139 | req.RegionId = opts.region 140 | req.IpProtocol = rule['IpProtocol'] 141 | req.PortRange = 
rule['PortRange'] 142 | f = req.getResponse() 143 | check_aliyun_api_ret_code(f) 144 | 145 | def launch_instance(opts, cluster_name, role, ami, instance_type, security_group_id, instance_name, 146 | internet_band_out, host_name, pass_word, open_public_ip=False): 147 | req = aliyun.api.Ecs20140526CreateInstanceRequest() 148 | req.RegionId = opts.region 149 | req.ImageId = ami 150 | req.InstanceType = instance_type 151 | req.SecurityGroupId = security_group_id 152 | req.InstanceName = instance_name 153 | req.HostName = host_name.replace('_', '-') 154 | req.Password = pass_word 155 | if role == "masters" or open_public_ip: 156 | req.InternetChargeType = "PayByTraffic" 157 | req.InternetMaxBandwidthOut = internet_band_out 158 | else: 159 | req.InternetChargeType = "PayByBandwidth" 160 | req.InternetMaxBandwidthOut = "0" 161 | if opts.disk_size is not None: 162 | req.DataDisk_1_Category = "cloud" 163 | req.DataDisk_1_Device = "/dev/xvdb" 164 | req.DataDisk_1_Size = opts.disk_size 165 | 166 | req2 = aliyun.api.Ecs20140526StartInstanceRequest() 167 | req3 = aliyun.api.Ecs20140526AllocatePublicIpAddressRequest() 168 | 169 | f = req.getResponse() 170 | check_aliyun_api_ret_code(f) 171 | instance_id = f['InstanceId'] 172 | utils.save_masters_or_slaves(cluster_name, role, instance_id) 173 | if open_public_ip: 174 | req3.InstanceId = instance_id 175 | f = req3.getResponse() 176 | check_aliyun_api_ret_code(f) 177 | req2.InstanceId = instance_id 178 | f = req2.getResponse() 179 | check_aliyun_api_ret_code(f) 180 | return instance_id 181 | 182 | def release_ecs_instance(instance_ids): 183 | print("Terminating masters and slaves...") 184 | start_time = datetime.now() 185 | 186 | print "==> Checking cluster status. We can do noting before the cluster enter `Running` status..." 187 | utils.wait_for_cluster_state(['Running', 'Stopping', 'Stopped'], instance_ids) 188 | print "==> Checked OK..." 189 | 190 | need_to_stop = [] 191 | need_to_release = [] 192 | for ins in instance_ids: 193 | instance_info = get_instance_info(ins) 194 | status = instance_info['Status'] 195 | if status in ['Running']: 196 | need_to_stop.append(ins) 197 | need_to_release.append(ins) 198 | elif status in ['Stopped', 'Stopping']: 199 | need_to_release.append(ins) 200 | 201 | for ins in need_to_stop: 202 | try: 203 | req = aliyun.api.Ecs20140526StopInstanceRequest() 204 | req.InstanceId = ins 205 | f = req.getResponse() 206 | check_aliyun_api_ret_code(f) 207 | except Exception, e: 208 | print e 209 | raise e 210 | 211 | retries = 0 212 | while True: 213 | time.sleep(5) 214 | all_released = True 215 | 216 | for ins in need_to_release: 217 | try: 218 | instance_info = get_instance_info(ins) 219 | if instance_info['Status'] == "Stopped": 220 | req2 = aliyun.api.Ecs20140526DeleteInstanceRequest() 221 | req2.InstanceId = ins 222 | f = req2.getResponse() 223 | check_aliyun_api_ret_code(f) 224 | elif instance_info['Status'] in ["Running", "Stopping"]: 225 | all_released = False 226 | except Exception, e: 227 | print >> stderr, "Error releasing ECS instance, retrying later." 228 | time.sleep(5) 229 | if retries >= 10: 230 | raise e 231 | 232 | if all_released: 233 | break 234 | 235 | retries += 1 236 | 237 | end_time = datetime.now() 238 | print "Cluster instances have been released successfully. 
Waited {t} seconds.".format( 239 | t=(end_time - start_time).seconds 240 | ) 241 | 242 | def get_all_instances_status(instances): 243 | all_instances_status = [] 244 | for ins in instances: 245 | instance_status = get_instance_info(ins)['Status'] 246 | all_instances_status.append(instance_status) 247 | return all_instances_status 248 | 249 | def get_instance_info(instance_id): 250 | req = aliyun.api.Ecs20140526DescribeInstanceAttributeRequest() 251 | req.InstanceId = instance_id 252 | f = req.getResponse() 253 | check_aliyun_api_ret_code(f) 254 | return f 255 | 256 | -------------------------------------------------------------------------------- /doc/manual.md: -------------------------------------------------------------------------------- 1 | # Spark On ECS 2 | v0.2 3 | 2015.6.30 4 | 5 | 6 | ## Prepare 7 | ------------- 8 | ### 三种工作模式 9 | 脚本工作在三种不同的模式下,下面会介绍三种不同的模式: 10 | * cluster + gateway exclude模式 11 | 需要先申请一台具有公网访问能力的ECS机器作为gateway,然后脚本会自动创建一个新的master和多台slaves,最终这个gateway机器不会成为集群的一部分。 12 | * cluster + gateway include模式 13 | 需要先申请一台具有公网访问能力的ECS机器作为gateway。这台机器会作为集群的master存在,脚本会创建其余的slaves。 14 | * client 模式 15 | 用户可以自行在ECS的购买页面上先行购买好所有的机器,(但是需要使用我们的Spark环境的镜像,此外机器的密码目前需要都一样)。然后在其中一台具有公网访问能力的机器上配置机器信息的配置文件,脚本会读取配置并负责环境的启动。 16 | 17 | 18 | ## Quick Start 19 | 20 | #### cluster + gateway exclude模式 21 | ------------- 22 | 23 | ### 1. 选购Gateway 24 | 在[阿里云ECS](http://www.aliyun.com/product/ecs/)购买一台ECS实例作为Gateway,用来执行自动化部署脚本。 25 | 26 | * Gateway需要配置公网IP,默认不作为Spark集群的一部分,Gateway可以用低配 27 | * Gateway所在地域默认为spark cluster的地域(Region) 28 | 29 | ### 2. 配置环境变量 30 | 31 | 从[AccessKey管理](https://ak-console.aliyun.com/#/accesskey)获得阿里云API公钥密钥。在gateway上配置环境变量: 32 | *ALIYUN_ACCESS_ID*和*ALIYUN_ACCESS_KEY* 33 | 34 | ``` 35 | export ALIYUN_ACCESS_ID=HAxxxxxxxxxx2 36 | export ALIYUN_ACCESS_KEY=JAxxxxxxxxxxxxxxxxxxxxxxxxxs 37 | ``` 38 | *考虑到安全性, 推荐每次登陆时在当前会话中设置环境变量; 出于方便也可以在.bash_profile中配置(不推荐)* 39 | 40 | ### 3. 执行脚本, 启动spark集群 41 | 42 | - 在geteway上执行: **`python spark_ecs.py --mode=cluster -t ecs.s2.large launch spark-test`** 43 | 44 | - 购买前会有一个Check List,列出您购买的ECS实例配置和个数,如下: 45 | 46 | ``` 47 | +--------------------------------------------------------+ 48 | + Check List + 49 | +--------------------------------------------------------+ 50 | 51 | Running Mode: cluster 52 | 53 | Master Instance: 54 | Number: 1 55 | Region: cn-hangzhou 56 | Zone: cn-hangzhou-d 57 | Cores: 2 58 | Memory: 4G 59 | InstanceType: ecs.s2.large 60 | InternetChargeType: PayByTraffic 61 | InternetMaxBandwidthOut: 2 62 | 63 | 64 | Slave Instance: 65 | Number: 1 66 | Region: cn-hangzhou 67 | Zone: cn-hangzhou-d 68 | Cores: 2 69 | Memory: 4G 70 | InstanceType: ecs.s2.large 71 | InternetChargeType: PayByBandwidth 72 | InternetMaxBandwidthOut: 0 73 | +--------------------------------------------------------+ 74 | 75 | ``` 76 | 这里会看到所有的生成的实例的信息,比如 77 | 78 | * Number 对应节点的数量 79 | * Region 表示所在的region 80 | * Zone 所在的zone 81 | * Image 使用的镜像的id 82 | * Cores 机器的核数的配置,目前所有的master和slaves的配置都是一样的 83 | * Memory 使用的内容,目前所有的master和slaves的配置都是一样的 84 | * InstanceType 这个是ECS的官方机型缩略代号 85 | * SecurityGroup 机器所在的安全组,一般同一个集群的会在同一个安全组内 86 | * InternetChargeType 公网流量的付费方式,按量和按带宽 87 | * InternetMaxBandwidthOut 带宽大小 88 | 89 | 90 | 启动完,会打印出Spark集群所有服务的简要信息,如下: 91 | 92 | ``` 93 | +--------------------------------------------------------+ 94 | + Spark Cluster Started Successfully! 
+ 95 | +--------------------------------------------------------+ 96 | The Spark Cluster Configuration listed as following: 97 | 98 | Spark Cluster: 99 | 100 | Spark UI: http://xxx.xxx.xxx.xxx:8080 101 | Master URL: spark://spark-test-master:7077 102 | 103 | +--------------------------------------------------------+ 104 | ``` 105 | - 到这里Spark Cluster就完全起来了, 下面可以愉快的跑spark任务了。 106 | 107 | ### 4. Spark Sample Test 108 | - 登陆到spark master: *ssh xxx.xxx.xxx.xxx*, master ip可以根据上面的成功启动的信息里面找到 109 | - 执行: `/opt/spark/bin/run-example SparkPi`, 测试spark任务能否跑成功。 110 | 111 | ### 5. 停止Spark Cluster和释放ECS 112 | 登陆到gateway上: 113 | 114 | * 停止spark cluster: `spark_ecs.py --mode=cluster stop spark-test` 115 | * 启动spark cluster: `spark_ecs.py --mode=cluster start spark-test` 116 | * 释放ECS资源: `spark_ecs.py --mode=cluster destroy spark-test` 117 | 118 | ## cluster + gateway include模式 119 | 基本上cluster gateway exclude模式一样,以下的几部需要注意 120 | ### 3. 执行脚本, 启动spark集群 121 | 122 | - 执行: **`python spark_ecs.py --mode=cluster --include-gateway -t ecs.s2.large launch spark-test`** 123 | - 需要注意的是新申请的slaves的密码需要和已有的master一致 124 | 125 | ### 4. Spark Sample Test 126 | - 由于本机就是master,所以可以直接执行: `/opt/spark/bin/run-example SparkPi`, 测试spark任务能否跑成功。 127 | 128 | ### 5. 停止Spark Cluster和释放ECS 129 | 由于本机就是master,直接在master机器上执行 130 | * 停止spark cluster: `spark_ecs.py --mode=cluster stop spark-test` 131 | * 启动spark cluster: `spark_ecs.py --mode=cluster start spark-test` 132 | * 释放ECS资源: `spark_ecs.py --mode=cluster destroy spark-test` 133 | 134 | ## client模式 135 | ### 1. 选购集群机器 136 | 不再需要选购gateway,取而代之的是,需要在ECS购买页面上购买好所有的机器,包括master和slaves 137 | 138 | ### 2. 配置环境变量 139 | 从[AccessKey管理](https://ak-console.aliyun.com/#/accesskey)获得阿里云API公钥密钥。在master上配置环境变量: *ALIYUN_ACCESS_ID*和*ALIYUN_ACCESS_KEY* 140 | 141 | ``` 142 | export ALIYUN_ACCESS_ID=HAxxxxxxxxxx2 143 | export ALIYUN_ACCESS_KEY=JAxxxxxxxxxxxxxxxxxxxxxxxxxs 144 | ``` 145 | *考虑到安全性, 推荐每次登陆时在当前会话中设置环境变量; 出于方便也可以在.bash_profile中配置(不推荐)* 146 | 147 | ### 3. 脚本下载 148 | 149 | 下载地址: [此处](), 将脚本拷贝到master任意目录下,例如`$HOME/spark` 150 | 并在脚本目录下创建master和slaves文件 151 | master内将要作为master机器的instance id写进去 152 | 一行一个id,类似 153 | ``` 154 | i-m32135678d 155 | ``` 156 | slaves内将要作为slaves机器的instance id(instance id 可以在ECS的实例列表上看到。)写进去 157 | 一行一个id,类似 158 | ``` 159 | i-m12563538d 160 | i-m12332678d 161 | i-m46745678d 162 | ``` 163 | 164 | ### 4. 执行脚本, 启动spark集群 165 | 166 | - 在master上执行: **`python spark_ecs.py --mode=client launch spark-test`** 167 | 168 | ### 6. 
停止Spark Cluster和释放ECS 169 | 在master上: 170 | 171 | * 停止spark cluster: `spark_ecs.py --mode=client stop spark-test` 172 | * 启动spark cluster: `spark_ecs.py --mode=client start spark-test` 173 | * 释放ECS资源: `spark_ecs.py --mode=client destroy spark-test` 174 | 175 | **脚本的更多参数设置见下面的用户手册。** 176 | 177 | ## Manual 178 | ------------- 179 | 180 | **Usage: `spark-ecs [options] [:]`** 181 | **``可以是: launch, destroy, stop, start, enable, disable** 182 | **``可以是: hdfs, hue, spark-notebook** 183 | 184 | 启动Spark集群格式: `python spark-ecs.py -t -i -s -p launch ` 185 | 186 | 例如: `python spark-ecs.py -t ecs.s2.large -i m-xxxxxxx5j -s 2 -p xxxxxx launch test` 187 | 188 | 启动单独服务格式:`python spark-ecs.py enable :` 189 | 190 | 例如: `python spark-ecs.py enable test:hdfs` 191 | 192 | ### 命令描述 193 | 194 | 命令 | 参数 | 描述 195 | ----|---- | ---- 196 | launch|集群名字|创建并启动一个Spark集群 197 | destroy|集群名字|销毁Spark集群,并释放集群中所有ECS实例, **集群数据将无法恢复,请及时转移重要数据**。销毁后集群ECS实例将停止收取相关费用 198 | stop|集群名字|停止Saprk集群,集群实例不会被释放,集群中数据不会丢失。**集群ECS实例将继续收取相关费用** 199 | start|集群名字|再次启动Spark集群 200 | enable|子服务名|启用一个子服务,例如hdfs,hue或者spark-notebook 201 | disable|子服务名|关闭一个子服务,例如hdfs,hue或者spark-notebook 202 | 203 | 脚本执行完会打印出: 204 | 205 | * Spark UI地址: `http://:8080` 206 | * Spark Master: `spark://:7077` 207 | * Spark Notebook(可选): `http://:9090` 208 | * Hue(可选): `http://:8888` 209 | 210 | 访问Spark UI检查所有的slave节点是否正常启动。Spark UI的使用方式见下面的说明。 211 | 212 | ### 脚本参数说明 213 | 214 | 运行`python spark-ecs.py --help`查看使用帮助。以下列出主要的配置项说明: 215 | 216 | | 参数 |缩写| 要求 | 默认值 | 描述 | 可用模式 | 217 | | ------------ | --- | ------------- | ------------ | ------------ | ----- | 218 | |`--instance-type=`|-t|可选|无|配置所要创建的ECS实例类型. 更多类型见: [实例资源规格对照表](http://docs.aliyun.com/?spm=5176.730001.3.16.5mmF39#/pub/ecs/open-api/appendix&instancetype)|clueter模式有效| 219 | | `--mode=`|-m|可选|cluster|运行模式。可选有client模式和cluster模式。client模式是使用已有ECS实例;cluster模式是创建新的ECS实例 | - | 220 | | `--pwd=` |-p|可选|无|配置Spark集群中每个ECS实例的默认密码|clueter模式有效| 221 | | `--ami=`|无|可选|无|配置阿里云ECS机器镜像ID|clueter模式有效| 222 | |`--slaves=`|-s|可选|1|配置Spark集群中Slave节点数|clueter模式有效| 223 | |`--ibo=`|无|可选|2MB|配置实例的流出的带宽上限,计费以发生的公共网络流量为依据|clueter模式有效| 224 | |`--region=`|-r|必选|无|配置ECS实例所属的Region. 注意:**Spark集群的ECS实例Region需要和login机器Region保持一致**|clueter模式有效| 225 | |`--zone=`|-z|可选|*cn-hangzhou-d*|配置ECS实例所属可用区|clueter模式有效| 226 | |`--include-gateway`|无 |可选|不包含|是否将当前登录机器包含进Spark集群|clueter模式有效| 227 | |`--enable-slave-public-ip`|无|可选|不配置|是否配置Spark Slave节点的公网IP|clueter模式有效| 228 | |`--enable-hdfs`|无|可选|不开启|是否打开HDFS服务|两种模式有效| 229 | |`--enable-hue`|无|可选|不开启|是否打开HUE服务|两种模式有效| 230 | |`--enable-spark-notebook`|无|可选|不开启|是否打开Spark Notebook服务|两种模式有效| 231 | 232 | **注意点:** 233 | 234 | 1. client模式时,一些参数无效,请注意每个参数的可用模式 235 | 2. cluster模式时,您可以选择是否将当前login机器加入到Spark集群中,详见`--include-gateway`参数。 236 | 3. 不同可用区之间的数据传输需要收取公网流量费用:**¥0.8/GB** 237 | 238 | ### 模式参数说明 239 | 240 | 1. cluster模式 241 | ECS实例的申请,集群和服务的启动完全通过脚本完成。 242 | 2. client模式 243 | 基于用户已有ECS实例,完成集群和服务的启动。 244 | 通过阿里云的售卖页面完成ECS实例的购买可以更加直观地获得费用信息。client模式需要提供两个文件`masters`和`slaves`,分别包含Master节点和Slave节点的实例ID,即`InstanceId`。可以在[ECS控制台](https://console.aliyun.com/ecs/index.htm)查看每个ECS实例的`InstanceId`。 245 | 246 | **注意点:** 247 | 248 | 1. `masters`和`slaves`必须和脚本本放在同一目录中 249 | 2. 使用client模式时,您需要注意购买ECS实例时选择我们提供的镜像并设置相同的默认密码,具体可参考[Spark镜像列表](https://github.com/aliyun/spark-on-ecs/tree/master/ecs-image-list)。 250 | 251 | 252 | ## Spark相关 253 | 254 | ### Spark UI 255 | 目前提供两种方式支持Spark UI,即SSH隧道和公网开放式两种。 256 | 257 | 1. 
SSH隧道:通过在PC和Spark master节点之间的SSH隧道建立连接。这种方式安全性将会高一些,但需要您做一定的配置工作。具体操作过程请详见[SSH隧道使用指引](https://github.com/aliyun/spark-on-ecs/tree/master/doc/ssh_tunnel.md)。 258 | 2. 公网开放式:这种方式需要您在购买ECS实例时配置一个公网IP。这种方式会额外打开一些端口,例如8080,8081,9090等,安全性比SSH方式低,但使用上更加方便。 259 | - 脚本执行完, 会在当前目录创建Spark集群的Hosts列表文件,请把这个文件内容拷贝到本机的hosts文件中。Windows用户请编辑`C:\Windows\System32\drivers\etc\hosts`文件,Linux用户请编辑`\etc\hosts`文件 260 | - 由于每次创建集群的机器名和公网IP都会发生变化,所以一旦销毁集群请及时清除本机中相关的Hosts修改 261 | 262 | **注意:** 建议使用SSH隧道方式。 263 | 264 | ### Spark Notebook 265 | 266 | Spark Notebook提供一种交互式的编程方式,您可以在上面进行Spark程序开发。更多信息请关注[Spark Notebook](https://github.com/andypetrella/spark-notebook)的最新进展。 267 | 268 | ### Hue 269 | 270 | Hue是一种开源的进行大数据分析的Web平台。更多信息请关注[Cloudera-Hue](https://github.com/cloudera/hue)的最新进展。 271 | 272 | ### 默认配置文件 273 | 274 | 本脚执行时会动态修改一些软件的配置文件。这些软件的默认配置文件放置在/root/.config目录下: 275 | 276 | 1. packages.property文件:配置每个软件的安装路径 277 | 2. hadoop目录:hadoop配置文件目录,包含core-site.xml,hdfs-site.xml以及hadoop-env.sh 278 | 3. hue目录:Hue配置文件目录,包含hue.ini 279 | -------------------------------------------------------------------------------- /bin/spark_ecs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding=utf-8 3 | import sys 4 | import os 5 | import getpass 6 | from service import hdfs, hue, spark, spark_notebook 7 | from core import utils, ecs 8 | from core.common import GlobalVar 9 | from sys import stderr 10 | from optparse import OptionParser 11 | 12 | utils.welcome() 13 | 14 | class UsageError(Exception): 15 | pass 16 | 17 | def parse_args(): 18 | parser = OptionParser( 19 | prog="spark-ecs", 20 | usage="%prog [options] [:]\n\n" 21 | + " can be: launch, destroy, stop, start, enable, disable \n" 22 | + " can be anything you want \n" 23 | + " can be: hdfs, hue, spark-notebook") 24 | parser.add_option( 25 | "-m", '--mode', type="string", 26 | help="There are two modes, i.e. `client` and `cluster`. " + 27 | "In `client` mode, you need to buy ECS instances firstly, and then provide `masters` file listing " + 28 | "Spark master `InstanceId` and `slaves` listing Spark slave `InstanceId`. 
In `cluster` mode, you " + 29 | "can create ECS instances and start Spark cluster through this script.") 30 | parser.add_option( 31 | "-p", '--pwd', type="string", help="User password for each ECS instance.") 32 | parser.add_option( 33 | "-s", "--slaves", type="int", default=1, help="Number of slaves to launch (default: %default)") 34 | parser.add_option( 35 | "-d", "--disk-size", type="int", help="Size (in GB) of each ECS data disk") 36 | parser.add_option( 37 | "--ibo", type="string", default="2", help="Internet bandwidth out") 38 | parser.add_option( 39 | "-t", "--instance-type", type="string", help="Type of instance to launch.") 40 | parser.add_option( 41 | "-r", "--region", type="string", help="ECS region to launch instances in") 42 | parser.add_option( 43 | "-z", "--zone", type="string", 44 | help="Availability zone to launch instances in, or 'all' to spread " + 45 | "slaves across multiple (an additional RMB 0.8/Gb for bandwidth" + 46 | "between zones applies) (default: a single zone chosen at random)") 47 | parser.add_option("-i", "--ami", help="Aliyun Machine Image ID to use") 48 | parser.add_option( 49 | "-u", "--user", default="root", 50 | help="The SSH user you want to connect as (default: %default)") 51 | parser.add_option( 52 | "--authorized-address", type="string", default="0.0.0.0/0", 53 | help="Address to authorize on created security groups (default: %default)") 54 | parser.add_option( 55 | "--include-gateway", action="store_true", default=False, 56 | help="Whether to put current login machine into Spark Cluster." 57 | ) 58 | parser.add_option( 59 | "--enable-slave-public-ip", action="store_true", default=False, 60 | help="Whether to allocate a public network IP for Spark master." 61 | ) 62 | parser.add_option( 63 | "--enable-hdfs", action="store_true", default=False, 64 | help="Whether to launch a HDFS service" 65 | ) 66 | parser.add_option( 67 | "--enable-spark-notebook", action="store_true", default=False, 68 | help="Launch a spark-notebook. More information: https://github.com/andypetrella/spark-notebook" 69 | ) 70 | parser.add_option( 71 | "--enable-hue", action="store_true", default=False, 72 | help="Launch a Hue web Service" 73 | ) 74 | 75 | (opts, command) = parser.parse_args() 76 | if len(command) != 2: 77 | parser.print_help() 78 | print "\nYou need to provide a [:]\n" 79 | sys.exit(1) 80 | (action, name) = command 81 | if action in ["launch", "stop", "start", "destroy"]: 82 | GlobalVar.CLUSTER_HOSTS = name + "-hosts" 83 | 84 | return opts, action, name 85 | 86 | def launch_in_cluster_mode(cluster_name, opts): 87 | # check cluster status trickly 88 | if utils.check_cluster_status(cluster_name, ['Running', 'Stopped']): 89 | print "Cluster %s has been launched, please `Destroy` it first." 
% cluster_name 90 | sys.exit(1) 91 | do_validity_check(opts) 92 | 93 | if opts.slaves <= 0: 94 | print >> stderr, "ERROR: You have to start as least 1 slave" 95 | sys.exit(1) 96 | (masters, slaves, master_ip) = utils.launch_cluster(opts, cluster_name) 97 | utils.wait_for_cluster_state( 98 | cluster_state=['Running'], 99 | instances=masters + slaves) 100 | utils.mount_disk(masters, slaves, opts) 101 | spark.setup_cluster(masters, slaves, opts, True) 102 | if opts.enable_spark_notebook: 103 | spark_notebook.start_spark_notebook(masters, opts) 104 | if opts.enable_hue: 105 | hue.start_hue(masters, opts) 106 | if opts.enable_hdfs: 107 | hdfs.setup_hdfs(masters, slaves, opts) 108 | if opts.enable_slave_public_ip: 109 | utils.save_public_ips(masters, slaves) 110 | utils.open_nginx(opts, masters) 111 | 112 | utils.end_of_startup(opts, master_ip, masters) 113 | # update cluster status 114 | os.system("echo Running > %s%s" % (GlobalVar.CLUSTER_STATUS, cluster_name)) 115 | 116 | def destroy_in_cluster_mode(cluster_name, opts): 117 | do_validity_check(opts) 118 | print "Are you sure you want to destroy the cluster %s?" % cluster_name 119 | print "The following instances will be terminated:" 120 | (masters, slaves) = utils.get_masters_and_slaves(opts.mode, cluster_name) 121 | if len(masters + slaves) <= 0: 122 | print "There is no master or slave, check it first please." 123 | sys.exit(1) 124 | instances = masters + slaves 125 | gateway = ecs.get_gateway_instance_info(opts)['InstanceId'] 126 | if gateway in instances: 127 | instances.remove(gateway) 128 | 129 | to_release = [] 130 | for ins in instances: 131 | try: 132 | instance_info = ecs.get_instance_info(ins) 133 | to_release.append(ins) 134 | print "> %s" % (instance_info['HostName']) 135 | except Exception, e: 136 | if 'InvalidInstanceId.NotFound' in e.args: 137 | print "> %s, invalid `InstanceId` not found, skip it." % ins 138 | else: 139 | raise e 140 | 141 | utils.warning() 142 | msg = "All data on all nodes will be lost!!\nYou'd better stop it first. " \ 143 | "Destroy cluster %s (Y/n): " % cluster_name 144 | to_destroy = raw_input(msg) 145 | if to_destroy == "Y": 146 | try: 147 | ecs.release_ecs_instance(to_release) 148 | except Exception, e: 149 | print e, "\nReleasing ECS instances failed for some unknown reasons, " \ 150 | "you can do it through: https://console.aliyun.com/ecs/index.htm" 151 | raise e 152 | finally: 153 | utils.delete_file_safely(GlobalVar.CLUSTER_STATUS + cluster_name) 154 | utils.delete_file_safely(GlobalVar.CLUSTER_INSTANCES + cluster_name) 155 | utils.delete_file_safely(GlobalVar.SPARK_ECS_DIR + "/" + GlobalVar.CLUSTER_HOSTS) 156 | utils.delete_file_safely(GlobalVar.SPARK_ECS_DIR + "/" + GlobalVar.CLUSTER_HOSTS + "-public") 157 | else: 158 | print "Not `Y`, give up destroying cluster %s" % cluster_name 159 | 160 | def stop_in_cluster_mode(cluster_name, opts): 161 | # check cluster status trickly 162 | if utils.check_cluster_status(cluster_name, ['Stopped']): 163 | print "Cluster %s has been `Stopped`, you can not stop it again." % cluster_name 164 | sys.exit(1) 165 | do_validity_check(opts) 166 | 167 | (masters, slaves) = utils.get_masters_and_slaves(opts.mode, cluster_name) 168 | if len(masters + slaves) <= 0: 169 | print "There is no master or slave running, check it first please." 170 | sys.exit(1) 171 | 172 | print "==> Stopping Spark cluster..." 173 | utils.warning() 174 | msg = "Stopping Spark cluster will stop HDFS, spark-notebook and Hue at the same time. " \ 175 | "Stop it? 
(Y/n): " 176 | to_stop = raw_input(msg) 177 | if to_stop == "Y": 178 | if opts.pwd == "": 179 | opts.pwd = getpass.getpass("You need to provide the password for ECS instance:") 180 | spark.stop_spark_cluster(masters, slaves, opts) 181 | hdfs.stop_hdfs(masters, slaves, opts) 182 | hue.stop_hue(masters, opts) 183 | spark_notebook.stop_spark_notebook(masters, opts) 184 | utils.stop_nginx(opts,masters) 185 | # update cluster status 186 | os.system("echo Stopped > %s%s" % (GlobalVar.CLUSTER_STATUS, cluster_name)) 187 | else: 188 | print "Not `Y`, give up stopping cluster %s" % cluster_name 189 | 190 | def start_in_cluster_mode(cluster_name, opts): 191 | # check cluster status trickly 192 | if utils.check_cluster_status(cluster_name, ['Running']): 193 | print "Cluster %s is `Running`, please `Stop` it first." % cluster_name 194 | sys.exit(1) 195 | do_validity_check(opts) 196 | 197 | (masters, slaves) = utils.get_masters_and_slaves(opts.mode, cluster_name) 198 | if len(masters + slaves) <= 0: 199 | print "There is no master or slave, check it first please." 200 | sys.exit(1) 201 | 202 | print "==> Restarting spark cluster..." 203 | if opts.pwd == "": 204 | opts.pwd = getpass.getpass("You need to provide the password for ECS instance:") 205 | master_ip = ecs.get_instance_info(masters[0])['PublicIpAddress']['IpAddress'][0] 206 | spark.start_spark_cluster(masters[0], slaves, opts) 207 | if opts.enable_spark_notebook: 208 | spark_notebook.start_spark_notebook(masters, opts) 209 | if opts.enable_hue: 210 | hue.start_hue(masters, opts) 211 | if opts.enable_hdfs: 212 | hdfs.setup_hdfs(masters, slaves, opts) 213 | if opts.enable_slave_public_ip: 214 | utils.save_public_ips(masters, slaves) 215 | utils.open_nginx(opts, masters) 216 | 217 | utils.end_of_startup(opts, master_ip, masters) 218 | # update cluster status 219 | os.system("echo Running > %s%s" % (GlobalVar.CLUSTER_STATUS, cluster_name)) 220 | 221 | def launch_in_client_mode(cluster_name, opts): 222 | # check cluster status trickly 223 | if utils.check_cluster_status(cluster_name, ['Running', 'Stopped']): 224 | print "Cluster %s has been launched, please `Destroy` it first." % cluster_name 225 | sys.exit(1) 226 | do_validity_check(opts) 227 | 228 | (masters, slaves) = utils.get_masters_and_slaves(opts.mode) 229 | if len(masters) <= 0: 230 | print >> stderr, "ERROR: You have to start as least 1 master" 231 | sys.exit(1) 232 | if len(slaves) <= 0: 233 | print >> stderr, "ERROR: You have to start as least 1 slave" 234 | sys.exit(1) 235 | 236 | # Now we only support single-node master. 237 | spark.setup_cluster(masters, slaves, opts, True) 238 | if opts.enable_spark_notebook: 239 | spark_notebook.start_spark_notebook(masters, opts) 240 | if opts.enable_hue: 241 | hue.start_hue(masters, opts) 242 | if opts.enable_hdfs: 243 | hdfs.setup_hdfs(masters, slaves, opts) 244 | if opts.enable_slave_public_ip: 245 | utils.save_public_ips(masters, slaves) 246 | master_ip = ecs.get_instance_info(masters[0])['PublicIpAddress']['IpAddress'][0] 247 | 248 | utils.open_nginx(opts, masters) 249 | utils.end_of_startup(opts, master_ip, masters) 250 | # update cluster status 251 | os.system("echo Running > %s%s" % (GlobalVar.CLUSTER_STATUS, cluster_name)) 252 | 253 | def destroy_in_client_mode(cluster_name, opts): 254 | do_validity_check(opts) 255 | (masters, slaves) = utils.get_masters_and_slaves(opts.mode) 256 | if len(masters + slaves) <= 0: 257 | print "There is no master or slave, check it first please." 
258 | sys.exit(1) 259 | 260 | print "Are you sure you want to destroy the cluster %s?" % cluster_name 261 | print "The following instances will be terminated:" 262 | instances = masters + slaves 263 | gateway = ecs.get_gateway_instance_info(opts)['InstanceId'] 264 | if gateway in instances: 265 | instances.remove(gateway) 266 | to_release = [] 267 | for ins in instances: 268 | try: 269 | instance_info = ecs.get_instance_info(ins) 270 | to_release.append(ins) 271 | print "> %s" % (instance_info['HostName']) 272 | except Exception, e: 273 | if 'InvalidInstanceId.NotFound' in e.args: 274 | print "> %s, invalid `InstanceId` not found, skip it." % ins 275 | else: 276 | raise e 277 | 278 | utils.warning() 279 | msg = "All data on all nodes will be lost!!\nYou'd better stop it first. " \ 280 | "Destroy cluster %s (Y/n): " % cluster_name 281 | to_destroy = raw_input(msg) 282 | if to_destroy == "Y": 283 | try: 284 | ecs.release_ecs_instance(to_release) 285 | except Exception, e: 286 | print e, "Releasing ECS instances failed for some unknown reasons, " \ 287 | "you can do it through: https://console.aliyun.com/ecs/index.htm" 288 | raise e 289 | finally: 290 | utils.delete_file_safely(GlobalVar.CLUSTER_STATUS + cluster_name) 291 | utils.delete_file_safely(GlobalVar.CLUSTER_INSTANCES + cluster_name) 292 | utils.delete_file_safely(GlobalVar.SPARK_ECS_DIR + "/" + GlobalVar.CLUSTER_HOSTS) 293 | utils.delete_file_safely(GlobalVar.SPARK_ECS_DIR + "/" + GlobalVar.CLUSTER_HOSTS + "-public") 294 | else: 295 | print "Not `Y`, give up destroying cluster %s" % cluster_name 296 | sys.exit(1) 297 | 298 | def stop_in_client_mode(cluster_name, opts): 299 | # check cluster status trickly 300 | if utils.check_cluster_status(cluster_name, ['Stopped']): 301 | print "Cluster %s has been `Stopped`, you can not stop it again." % cluster_name 302 | sys.exit(1) 303 | do_validity_check(opts) 304 | 305 | (masters, slaves) = utils.get_masters_and_slaves(opts.mode) 306 | if len(masters + slaves) <= 0: 307 | print "There is no master or slave running, check it first please." 308 | sys.exit(1) 309 | 310 | print "==> Stopping spark cluster..." 311 | utils.warning() 312 | msg = "Stopping Spark cluster will stop HDFS, spark-notebook and Hue at the same time. " \ 313 | "Stop %s? (Y/n): " % cluster_name 314 | to_stop = raw_input(msg) 315 | if to_stop == "Y": 316 | if opts.pwd == "": 317 | opts.pwd = getpass.getpass("You need to provide the password for ECS instance:") 318 | spark.stop_spark_cluster(masters, slaves, opts) 319 | hdfs.stop_hdfs(masters, slaves, opts) 320 | hue.stop_hue(masters, opts) 321 | spark_notebook.stop_spark_notebook(masters, opts) 322 | utils.stop_nginx(opts,masters) 323 | # update cluster status 324 | os.system("echo Stopped > %s%s" % (GlobalVar.CLUSTER_STATUS, cluster_name)) 325 | else: 326 | print "Not `Y`, give up stopping cluster %s" % cluster_name 327 | 328 | def start_in_client_mode(cluster_name, opts): 329 | # check cluster status trickly 330 | if utils.check_cluster_status(cluster_name, ['Running']): 331 | print "Cluster %s is `Running`, please `Stop` it first." % cluster_name 332 | sys.exit(1) 333 | do_validity_check(opts) 334 | 335 | (masters, slaves) = utils.get_masters_and_slaves(opts.mode) 336 | if len(masters + slaves) <= 0: 337 | print "There is no master or slave, check it first please." 338 | sys.exit(1) 339 | 340 | print "==> Restarting spark cluster..." 
341 |     if opts.pwd == "":
342 |         opts.pwd = getpass.getpass("You need to provide the password for ECS instance:")
343 |     spark.start_spark_cluster(masters[0], slaves, opts)
344 |     if opts.enable_spark_notebook:
345 |         spark_notebook.start_spark_notebook(masters, opts)
346 |     if opts.enable_hue:
347 |         hue.start_hue(masters, opts)
348 |     if opts.enable_hdfs:
349 |         hdfs.setup_hdfs(masters, slaves, opts)
350 |     if opts.enable_slave_public_ip:
351 |         utils.save_public_ips(masters, slaves)
352 |     master_ip = ecs.get_instance_info(masters[0])['PublicIpAddress']['IpAddress'][0]
353 |     utils.open_nginx(opts, masters)
354 |     utils.end_of_startup(opts, master_ip, masters)
355 |     # update cluster status
356 |     os.system("echo Running > %s%s" % (GlobalVar.CLUSTER_STATUS, cluster_name))
357 | 
358 | def enable_module(name, opts):
359 |     if len(name.split(":")) != 2:
360 |         print "\nYou need to provide a <cluster-name>:<module-name>\n"
361 |         sys.exit(1)
362 |     cluster_name = name.split(":")[0]
363 |     module_name = name.split(":")[1]
364 |     do_validity_check(opts)
365 |     (masters, slaves) = utils.get_masters_and_slaves(opts.mode, cluster_name)
366 |     if module_name == "hdfs":
367 |         hdfs.setup_hdfs(masters, slaves, opts)
368 |     elif module_name == "hue":
369 |         hue.start_hue(masters, opts)
370 |     elif module_name == "spark-notebook":
371 |         spark_notebook.start_spark_notebook(masters, opts)
372 |     else:
373 |         print "Now we only support 3 modules: hdfs, hue, spark-notebook"
374 |         sys.exit(1)
375 | 
376 | def disable_module(name, opts):
377 |     if len(name.split(":")) != 2:
378 |         print "\nYou need to provide a <cluster-name>:<module-name>\n"
379 |         sys.exit(1)
380 |     cluster_name = name.split(":")[0]
381 |     module_name = name.split(":")[1]
382 |     do_validity_check(opts)
383 |     (masters, slaves) = utils.get_masters_and_slaves(opts.mode, cluster_name)
384 |     if module_name == "hdfs":
385 |         hdfs.stop_hdfs(masters, slaves, opts)
386 |     elif module_name == "hue":
387 |         hue.stop_hue(masters, opts)
388 |     elif module_name == "spark-notebook":
389 |         spark_notebook.stop_spark_notebook(masters, opts)
390 |     else:
391 |         print "Now we only support 3 modules: hdfs, hue, spark-notebook"
392 |         sys.exit(1)
393 | 
394 | def do_validity_check(opts):
395 |     if opts.region is None:
396 |         length = len(GlobalVar.ECS_REGION)
397 |         print "There are %s regions available, listed as follows:\n" % length
398 |         for id in range(1, length + 1):
399 |             print id, ":", GlobalVar.ECS_REGION["%s" % id]
400 |         print
401 |         msg = "Please specify the ECS region No. (like 1): "
402 |         opts.region = GlobalVar.ECS_REGION[raw_input(msg).strip()]
403 | 
404 |     if opts.pwd is None:
405 |         opts.pwd = getpass.getpass("""You need to provide a password for ECS instance.
406 | If `CLIENT` mode, you just need to provide login machine's password.
407 | If `CLUSTER` mode and `--include-gateway`, you just need to provide login machine's password.
408 | If `CLUSTER` mode only, you need to set a new default password for each ECS instance.
409 | Please set a password:""") 410 | 411 | def real_main(): 412 | (opts, action, name) = parse_args() 413 | utils.setup_sshpass() 414 | utils.read_properties() 415 | 416 | if opts.mode is None: 417 | msg = "Please specify the running mode, client/cluster: " 418 | opts.mode = raw_input(msg).strip() 419 | 420 | try: 421 | if action == "launch" and opts.mode == "cluster": 422 | launch_in_cluster_mode(name, opts) 423 | elif action == "destroy" and opts.mode == "cluster": 424 | destroy_in_cluster_mode(name, opts) 425 | elif action == "stop" and opts.mode == "cluster": 426 | stop_in_cluster_mode(name, opts) 427 | elif action == "start" and opts.mode == "cluster": 428 | start_in_cluster_mode(name, opts) 429 | elif action == "launch" and opts.mode == "client": 430 | launch_in_client_mode(name, opts) 431 | elif action == "destroy" and opts.mode == "client": 432 | destroy_in_client_mode(name, opts) 433 | elif action == "stop" and opts.mode == "client": 434 | stop_in_client_mode(name, opts) 435 | elif action == "start" and opts.mode == "client": 436 | start_in_client_mode(name, opts) 437 | elif action == "enable": 438 | enable_module(name, opts) 439 | elif action == "disable": 440 | disable_module(name, opts) 441 | else: 442 | print "Wrong action or mode or module. We support: \n " \ 443 | "6 actions: launch, stop, start, destroy, enable, disable \n " \ 444 | "2 modes: client and cluster \n " \ 445 | "3 modules: hdfs, hue, spark-notebook" 446 | except RuntimeError as e: 447 | utils.do_rollback() 448 | 449 | def main(): 450 | try: 451 | GlobalVar.SPARK_ECS_DIR = os.path.dirname(os.path.realpath(__file__)) 452 | real_main() 453 | except UsageError, e: 454 | print >> stderr, "\nERROR:\n", 455 | sys.exit(1) 456 | 457 | if __name__ == "__main__": 458 | main() 459 | 460 | -------------------------------------------------------------------------------- /bin/core/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding=utf-8 3 | import sys 4 | import os 5 | import shutil 6 | import pipes 7 | import getpass 8 | import subprocess 9 | import textwrap 10 | import time 11 | import ConfigParser 12 | import ecs 13 | from datetime import datetime 14 | from sys import stderr 15 | from xml.etree import ElementTree as ET 16 | from nginx import start_nginx, do_stop_nginx 17 | from common import GlobalVar 18 | 19 | class UsageError(Exception): 20 | pass 21 | 22 | def setup_sshpass(): 23 | try: 24 | print "==> Checking sshpass installed or not..." 25 | subprocess.check_call(['sshpass', '-V']) 26 | except Exception: 27 | print "Begin to setup sshpass..." 
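        # Try yum first (RHEL/CentOS images); fall back to apt-get on Debian/Ubuntu images.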
28 | try: 29 | subprocess.check_call(['yum', '-y', 'install', 'sshpass']) 30 | except Exception: 31 | subprocess.check_call(['apt-get', '-y', 'install', 'sshpass']) 32 | 33 | def read_properties(): 34 | if os.path.exists(GlobalVar.PROPERTY_FILE): 35 | cf = ConfigParser.ConfigParser() 36 | cf.read(GlobalVar.PROPERTY_FILE) 37 | GlobalVar.SPARK_INSTALL_DIR = cf.get('path', 'spark') 38 | GlobalVar.SPARK_NOTEBOOK_INSTALL_DIR = cf.get('path', 'spark-notebook') 39 | GlobalVar.HUE_INSTALL_DIR = cf.get('path', 'hue') 40 | GlobalVar.HADOOP_INSTALL_DIR = cf.get('path', 'hadoop') 41 | GlobalVar.HADOOP_CONF_DIR = "%s/etc/hadoop" % GlobalVar.HADOOP_INSTALL_DIR 42 | 43 | def save_masters_or_slaves(cluster_name, machine_type, instance_id): 44 | if instance_id is None: 45 | return 46 | dir = "%s/%s" % (GlobalVar.CLUSTER_INSTANCES, cluster_name) 47 | if not os.path.exists(dir): 48 | os.makedirs(dir) 49 | file = "%s/%s" % (dir, machine_type) 50 | if not os.path.exists(file): 51 | f = open(file, 'w') 52 | f.close() 53 | os.system("echo %s >> %s" % (str(instance_id), file)) 54 | 55 | def get_masters_and_slaves(mode, cluster_name=""): 56 | masters = [] 57 | slaves = [] 58 | if mode == "client": 59 | masters_file = "%s/%s" % (GlobalVar.SPARK_ECS_DIR, "masters") 60 | slaves_file = "%s/%s" % (GlobalVar.SPARK_ECS_DIR, "slaves") 61 | else: 62 | masters_file = "%s/%s/%s" % (GlobalVar.CLUSTER_INSTANCES, cluster_name, "masters") 63 | slaves_file = "%s/%s/%s" % (GlobalVar.CLUSTER_INSTANCES, cluster_name, "slaves") 64 | 65 | if os.path.exists(masters_file): 66 | f = open(masters_file, 'r') 67 | for line in f.readlines(): 68 | masters.append(line.strip()) 69 | if os.path.exists(slaves_file): 70 | f = open(slaves_file, 'r') 71 | for line in f.readlines(): 72 | slaves.append(line.strip()) 73 | 74 | return masters, slaves 75 | 76 | def match_and_change(property, tag, content): 77 | children = property.getchildren() 78 | if children[0].text == tag: 79 | children[1].text = content 80 | return 81 | 82 | def update_hadoop_configuration(namenode_url): 83 | file = ET.parse(GlobalVar.HADOOP_CONF_DIR + '/core-site.xml') 84 | properties = file.findall('./property') 85 | for property in properties: 86 | match_and_change(property, 'fs.defaultFS', namenode_url) 87 | file.write(GlobalVar.HADOOP_CONF_DIR + '/core-site.xml', encoding="utf-8") 88 | 89 | def ssh_args(): 90 | parts = ['-o', 'StrictHostKeyChecking=no'] 91 | parts += ['-o', 'UserKnownHostsFile=/dev/null'] 92 | parts += ['-o', 'LogLevel=quiet'] 93 | return parts 94 | 95 | def ssh_command(): 96 | return ['ssh'] + ssh_args() 97 | 98 | def scp_command(): 99 | return ['scp', '-r'] + ssh_args() 100 | 101 | def stringify_command(parts): 102 | if isinstance(parts, str): 103 | return parts 104 | else: 105 | return ' '.join(map(pipes.quote, parts)) 106 | 107 | def is_ssh_available(ip, opts, print_ssh_output=True): 108 | 109 | s = subprocess.Popen( 110 | ssh_command() + ['-t', '-t', '-o', 'ConnectTimeout=3', 111 | '%s@%s' % (opts.user, ip), stringify_command('true')], 112 | stdout=subprocess.PIPE, 113 | stderr=subprocess.STDOUT # we pipe stderr through stdout to preserve output order 114 | ) 115 | cmd_output = s.communicate()[0] # [1] is stderr, which we redirected to stdout 116 | 117 | if s.returncode != 0 and print_ssh_output: 118 | # extra leading newline is for spacing in wait_for_cluster_state() 119 | print textwrap.dedent("""\n 120 | Warning: SSH connection error. (This could be temporary.) 
121 |             Host: {h}
122 |             SSH return code: {r}
123 |             SSH output: {o}
124 |         """).format(
125 |             h=ip,
126 |             r=s.returncode,
127 |             o=cmd_output.strip()
128 |         )
129 | 
130 |     return s.returncode == 0
131 | 
132 | def is_cluster_ssh_available(cluster_instances, opts):
133 |     # Only report success once every instance in the cluster accepts an SSH connection.
134 |     for i in cluster_instances:
135 |         instance_info = ecs.get_instance_info(i)
136 |         ip = instance_info['InnerIpAddress']['IpAddress'][0]
137 |         if not is_ssh_available(ip, opts, True):
138 |             return False
139 |     return True
140 | 
141 | def save_public_ips(masters, slaves):
142 |     cluster_hosts = open(GlobalVar.SPARK_ECS_DIR + "/" + GlobalVar.CLUSTER_HOSTS + "-public", 'w')
143 | 
144 |     for node in masters + slaves:
145 |         instance_info = ecs.get_instance_info(node)
146 |         host = instance_info['HostName']
147 |         ip = instance_info['PublicIpAddress']['IpAddress'][0]
148 |         cluster_hosts.write(ip + " " + host + "\n")
149 | 
150 |     cluster_hosts.close()
151 | 
152 | def check_cluster_status(cluster_name, status):
153 |     if not os.path.exists(GlobalVar.DEFAULT_CONF_DIR + "/status"):
154 |         os.mkdir(GlobalVar.DEFAULT_CONF_DIR + "/status")
155 |     if not os.path.exists(GlobalVar.CLUSTER_STATUS + cluster_name):
156 |         f = open(GlobalVar.CLUSTER_STATUS + cluster_name, "w")
157 |         f.close()
158 |         return False
159 |     f = open(GlobalVar.CLUSTER_STATUS + cluster_name, "r")
160 |     stat = f.readline().strip()
161 |     return stat in status
162 | 
163 | def delete_file_safely(path):
164 |     print "deleting %s" % path
165 |     if os.path.exists(path):
166 |         if os.path.isdir(path):
167 |             shutil.rmtree(path)
168 |         else:
169 |             os.remove(path)
170 | 
171 | def launch_cluster(opts, cluster_name):
172 |     if opts.pwd == "":
173 |         opts.pwd = getpass.getpass("""You need to provide a password for ECS instance.
174 | If `CLIENT` mode, you just need to provide login machine's password.
175 | If `CLUSTER` mode and `--include-gateway`, you just need to provide login machine's password.
176 | If `CLUSTER` mode only, you need to set a new default password for each ECS instance.
177 | Please set a password:""")
178 | 
179 |     if opts.ami is None:
180 |         print "You need to specify an available ECS image, listed as follows: \n"
181 |         length = len(GlobalVar.AVAILABLE_SAPRK_VERSION)
182 |         for idx in range(1, length+1):
183 |             id = "%s" % idx
184 |             print idx, ': ', GlobalVar.AVAILABLE_SAPRK_VERSION[id]
185 |         print
186 |         msg = "Please choose an image No. (like: 1): "
187 |         id = raw_input(msg)
188 |         spark_version = GlobalVar.AVAILABLE_SAPRK_VERSION[id]
189 |         opts.ami = GlobalVar.SPARK_IMAGES[(spark_version, opts.region)]
190 | 
191 |     if opts.instance_type is None:
192 |         print "You need to specify the type of ECS instance, listed as follows: \n\n" \
193 |               "%-14s: %s" % ("type name", "(cores, memory)")
194 |         for instance_type in GlobalVar.ECS_INSTANCE_TYPE:
195 |             print "%-14s: %s" % (instance_type, GlobalVar.ECS_INSTANCE_TYPE[instance_type])
196 |         print
197 |         msg = "Please choose an ECS instance type (like: ecs.t1.small): "
198 |         opts.instance_type = str(raw_input(msg)).strip()
199 | 
200 |     print "==> Begin to launch Spark cluster..."
201 |     print_shopping_list(opts)
202 |     print "==> Setting internet security rules..."
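    # Reuse the gateway's security group for the whole cluster: clear its existing rules,
    # allow inbound SSH (22/tcp) from --authorized-address plus all outbound TCP, and open
    # the Spark UI / HDFS ports (8080, 8081, 9000) only when --enable-slave-public-ip is set.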
203 | current_group_id = ecs.get_gateway_instance_info(opts)['SecurityGroupIds']['SecurityGroupId'][0] 204 | ecs.clear_security_group_rules(current_group_id, opts) 205 | authorized_address = opts.authorized_address 206 | ecs.authorize_security_group_in(current_group_id, 'tcp', "", authorized_address, '22/22', opts) 207 | ecs.authorize_security_group_out(current_group_id, 'tcp', "", authorized_address, '1/65535', opts) 208 | if opts.enable_slave_public_ip: 209 | ecs.authorize_security_group_in(current_group_id, 'tcp', "", authorized_address, '8080/8080', opts) 210 | ecs.authorize_security_group_in(current_group_id, 'tcp', "", authorized_address, '8081/8081', opts) 211 | ecs.authorize_security_group_in(current_group_id, 'tcp', "", authorized_address, '9000/9000', opts) 212 | 213 | print "==> Launching master and slaves..." 214 | # Launch slaves 215 | master_instances = [] 216 | slave_instacens = [] 217 | count = 0 218 | while (count < opts.slaves): 219 | slave_instance_name = cluster_name + "-slave-%s" % (count) 220 | slave_instance_id = ecs.launch_instance(opts, cluster_name, "slaves", opts.ami, opts.instance_type, current_group_id, 221 | slave_instance_name, opts.ibo, slave_instance_name, 222 | opts.pwd, open_public_ip=opts.enable_slave_public_ip) 223 | slave_instacens.append(slave_instance_id) 224 | count += 1 225 | 226 | if not opts.include_gateway: 227 | # Launch master 228 | master_instance_name = cluster_name + "-master" 229 | master_instance_id = ecs.launch_instance(opts, cluster_name, "masters", opts.ami, opts.instance_type, current_group_id, 230 | master_instance_name, opts.ibo, master_instance_name, 231 | opts.pwd, open_public_ip=True) 232 | master_instances.append(master_instance_id) 233 | else: 234 | gateway = ecs.get_gateway_instance_info(opts)['InstanceId'] 235 | master_instances.append(gateway) 236 | save_masters_or_slaves(cluster_name, "masters", gateway) 237 | 238 | master_ip = ecs.get_instance_info(master_instances[0])['PublicIpAddress']['IpAddress'][0] 239 | 240 | return master_instances, slave_instacens, master_ip 241 | 242 | def wait_for_cluster_state(cluster_state, instances): 243 | sys.stdout.write("==> Waiting for cluster to enter one of `{s}` status .".format(s=cluster_state)) 244 | sys.stdout.flush() 245 | 246 | start_time = datetime.now() 247 | while True: 248 | time.sleep(5) 249 | 250 | all_instances_status = ecs.get_all_instances_status(instances) 251 | if all(status in cluster_state for status in all_instances_status): 252 | break 253 | 254 | sys.stdout.write(".") 255 | sys.stdout.flush() 256 | sys.stdout.write("\n") 257 | 258 | end_time = datetime.now() 259 | print "Cluster is now in one of '{s}' status. 
Waited {t} seconds.".format( 260 | s=cluster_state, 261 | t=(end_time - start_time).seconds 262 | ) 263 | 264 | def update_hosts(instance_id, opts, src, dst): 265 | src_file = src + "/" + GlobalVar.CLUSTER_HOSTS 266 | dst_file = dst + "/" + GlobalVar.CLUSTER_HOSTS 267 | append_hosts = "cat %s >> /etc/hosts" % dst_file 268 | remove_tmp_hosts = "rm -f %s" % dst_file 269 | do_scp(instance_id, opts, src_file, dst_file) 270 | do_ssh(instance_id, opts, append_hosts) 271 | do_ssh(instance_id, opts, remove_tmp_hosts) 272 | 273 | def do_scp(instance_id, opts, src, dst): 274 | instance_info = ecs.get_instance_info(instance_id) 275 | ip = instance_info['InnerIpAddress']['IpAddress'][0] 276 | tries = 0 277 | while True: 278 | try: 279 | res = subprocess.check_call( 280 | ["sshpass", "-p", opts.pwd] + 281 | scp_command() + [src, '%s@%s:%s' % (opts.user, ip, dst)]) 282 | if res != 0: 283 | raise RuntimeError("Error executing remote command.") 284 | return res 285 | except subprocess.CalledProcessError as e: 286 | if tries > 5: 287 | # If this was an ssh failure, provide the user with hints. 288 | if e.returncode == 255: 289 | raise UsageError( 290 | "Failed to SSH to remote host {0}.\n".format(ip)) 291 | else: 292 | raise e 293 | print >> stderr, \ 294 | "Error executing remote command, retrying after 10 seconds." 295 | time.sleep(10) 296 | tries += 1 297 | 298 | def do_ssh(instance_id, opts, command): 299 | instance_info = ecs.get_instance_info(instance_id) 300 | ip = instance_info['InnerIpAddress']['IpAddress'][0] 301 | tries = 0 302 | while True: 303 | try: 304 | res = subprocess.check_call( 305 | ["sshpass", "-p", opts.pwd] + 306 | ssh_command() + ['-t', '-t', '%s@%s' % (opts.user, ip), 307 | stringify_command(command)]) 308 | if res != 0: 309 | raise RuntimeError("Error executing remote command.") 310 | return res 311 | except subprocess.CalledProcessError as e: 312 | if tries > 5: 313 | # If this was an ssh failure, provide the user with hints. 314 | if e.returncode == 255: 315 | raise UsageError( 316 | "Failed to SSH to remote host {0}.\n".format(ip)) 317 | else: 318 | raise e 319 | print >> stderr, \ 320 | "Error executing remote command, retrying after 10 seconds." 
321 | time.sleep(10) 322 | tries += 1 323 | 324 | def _check_output(*popenargs, **kwargs): 325 | if 'stdout' in kwargs: 326 | raise ValueError('stdout argument not allowed, it will be overridden.') 327 | process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) 328 | output, unused_err = process.communicate() 329 | retcode = process.poll() 330 | if retcode: 331 | cmd = kwargs.get("args") 332 | if cmd is None: 333 | cmd = popenargs[0] 334 | raise subprocess.CalledProcessError(retcode, cmd, output=output) 335 | return output 336 | 337 | def ssh_read(instance_id, opts, command): 338 | instance_info = ecs.get_instance_info(instance_id) 339 | ip = instance_info['InnerIpAddress']['IpAddress'][0] 340 | return _check_output( 341 | ["sshpass", "-p", opts.pwd] + ssh_command() + ['%s@%s' % (opts.user, ip), stringify_command(command)]) 342 | 343 | def ssh_write(instance_id, opts, command, arguments): 344 | instance_info = ecs.get_instance_info(instance_id) 345 | ip = instance_info['InnerIpAddress']['IpAddress'][0] 346 | tries = 0 347 | while True: 348 | proc = subprocess.Popen( 349 | ["sshpass", "-p", opts.pwd] + 350 | ssh_command() + ['%s@%s' % (opts.user, ip), stringify_command(command)], 351 | stdin=subprocess.PIPE) 352 | proc.stdin.write(arguments) 353 | proc.stdin.close() 354 | status = proc.wait() 355 | if status == 0: 356 | break 357 | elif tries > 5: 358 | raise RuntimeError("ssh_write failed with error %s" % proc.returncode) 359 | else: 360 | print >> stderr, \ 361 | "Error {0} while executing remote command, retrying after 10 seconds".format(status) 362 | time.sleep(10) 363 | tries = tries + 1 364 | 365 | def prepare_hosts(master, slaves, opts): 366 | cluster_hosts = open(GlobalVar.SPARK_ECS_DIR + "/" + GlobalVar.CLUSTER_HOSTS, 'w') 367 | instance_info = ecs.get_instance_info(master) 368 | host = instance_info['HostName'] 369 | ip = instance_info['InnerIpAddress']['IpAddress'][0] 370 | cluster_hosts.write(ip + " " + host + "\n") 371 | 372 | for slave in slaves: 373 | instance_info = ecs.get_instance_info(slave) 374 | host = instance_info['HostName'] 375 | ip = instance_info['InnerIpAddress']['IpAddress'][0] 376 | cluster_hosts.write(ip + " " + host + "\n") 377 | 378 | cluster_hosts.close() 379 | update_hosts(master, opts, GlobalVar.SPARK_ECS_DIR, "/root/") 380 | for slave in slaves: 381 | update_hosts(slave, opts, GlobalVar.SPARK_ECS_DIR, "/root/") 382 | 383 | def mount_disk(masters, slaves, opts): 384 | print "==> mounting data disk: /dev/xvdb ..." 385 | src = "%s/sh/mount_disk.sh" % GlobalVar.SPARK_ECS_DIR 386 | dst = "/root/" 387 | command = "/bin/bash /root/mount_disk.sh > /dev/null 2>&1" 388 | for ins in masters + slaves: 389 | do_scp(ins, opts, src, dst) 390 | do_ssh(ins, opts, command) 391 | print "==> mounted OK..." 392 | 393 | # def update_default_output(opts): 394 | 395 | 396 | def open_nginx(opts,masters): 397 | print "==> Starting nginx service..." 398 | host_info_path = GlobalVar.CLUSTER_HOSTS 399 | master_ip = ecs.get_instance_info(masters[0])['PublicIpAddress']['IpAddress'][0] 400 | result_code = start_nginx(opts, host_info_path, master_ip) 401 | if result_code == 1: 402 | print("[success] start nginx succcess ...") 403 | else: 404 | print("[error] start nginx failed ...") 405 | 406 | def stop_nginx(opts, masters): 407 | print "==> Stopping nginx service..." 
408 | master_ip = ecs.get_instance_info(masters[0])['PublicIpAddress']['IpAddress'][0] 409 | result_code = do_stop_nginx(opts, master_ip) 410 | if result_code == 1: 411 | print("[success] stop nginx succcess ...") 412 | else: 413 | print("[error] stop nginx failed ...") 414 | 415 | def do_rollback(): 416 | print "==> Doing rollback..." 417 | # TODO: 418 | 419 | def welcome(): 420 | print """ 421 | Welcome to: 422 | ____ __ _____________ 423 | / __/__ ___ _____/ /__ ___ ____ /___/___//___/ 424 | _\ \/ _ \/ _ `/ __/ '_/ / _ \/__ / /___//___.\ \. 425 | /___/ .__/\_,_/_/ /_/\_\ \__./_/_/ /___/____/___/ 426 | /_/ version 0.1 427 | 428 | Type --help for more information. 429 | """ 430 | 431 | def print_shopping_list(opts): 432 | (cores, memory) = GlobalVar.ECS_INSTANCE_TYPE[opts.instance_type] 433 | if opts.disk_size is None: 434 | disk_size = "None" 435 | else: 436 | disk_size = "%sG" % opts.disk_size 437 | current_group_id = ecs.get_gateway_instance_info(opts)['SecurityGroupIds']['SecurityGroupId'][0] 438 | if opts.enable_slave_public_ip: 439 | slave_internet_charge_type = "PayByTraffic" 440 | slave_internet_bandwidth_out = opts.ibo 441 | else: 442 | slave_internet_charge_type = "PayByBandwidth" 443 | slave_internet_bandwidth_out = "0" 444 | 445 | print """The ECS instance configuration listed as following: 446 | 447 | +--------------------------------------------------------+ 448 | + Check List + 449 | +--------------------------------------------------------+""" 450 | if not opts.include_gateway: 451 | print """ 452 | Running Mode: %s 453 | 454 | Master Instance: 455 | Number: %s 456 | Region: %s 457 | Zone: %s 458 | Image: %s 459 | Cores: %s 460 | Memory: %sG 461 | Disk: %s 462 | InstanceType: %s 463 | SecurityGroup: %s 464 | InternetChargeType: %s 465 | InternetMaxBandwidthOut: %s 466 | """ % (opts.mode, "1", opts.region, opts.zone, opts.ami, cores, memory, 467 | disk_size, opts.instance_type, current_group_id, "PayByTraffic", opts.ibo) 468 | print """ 469 | Slave Instance: 470 | Number: %s 471 | Region: %s 472 | Zone: %s 473 | Image: %s 474 | Cores: %s 475 | Memory: %sG 476 | Disk: %s 477 | InstanceType: %s 478 | SecurityGroup: %s 479 | InternetChargeType: %s 480 | InternetMaxBandwidthOut: %s 481 | +--------------------------------------------------------+ 482 | """ % (opts.slaves, opts.region, opts.zone, opts.ami, cores, memory, opts.instance_type, 483 | disk_size, current_group_id, slave_internet_charge_type, slave_internet_bandwidth_out) 484 | msg = "Continue buying? (Y/n): " 485 | to_buy = raw_input(msg) 486 | if to_buy != "Y": 487 | print "Not `Y`, give up buying ECS instances, Goodbye!" 488 | sys.exit(1) 489 | 490 | def end_of_startup(opts, master_ip, masters): 491 | master_name = ecs.get_instance_info(masters[0])['HostName'] 492 | print """ 493 | +--------------------------------------------------------+ 494 | + Spark Cluster Started Successfully! 
+ 495 | +--------------------------------------------------------+ 496 | The Spark Cluster Configuration listed as following: 497 | 498 | Spark Cluster: 499 | 500 | Master Node IP: %s 501 | Spark UI: http://%s:8080 502 | Master URL: spark://%s:7077 503 | 504 | """ % (master_ip, master_ip, master_name) 505 | 506 | if opts.enable_hdfs: 507 | print """ 508 | HDFS NameNode URL: hdfs://%s:9000 509 | """ % master_ip 510 | 511 | if opts.enable_spark_notebook: 512 | print """ 513 | Spark Notebook: http://%s:9090 514 | """ % master_ip 515 | if opts.enable_hue: 516 | print """ 517 | Hue: http://%s:8888 518 | """ % master_ip 519 | print""" 520 | +--------------------------------------------------------+ 521 | """ 522 | 523 | def warning(): 524 | print """ 525 | ********************************************************** 526 | ** WARNING!!! ** 527 | ********************************************************** 528 | """ 529 | --------------------------------------------------------------------------------
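A minimal usage sketch, inferred from parse_args() and real_main() in bin/spark_ecs.py above. The cluster name `demo` and all option values are illustrative only; see doc/manual.md for the authoritative guide. Anything not supplied on the command line (running mode, region, image, instance type, password) is prompted for interactively.

# Launch a cluster named `demo` with 3 slaves (the script prompts for mode, region, image, etc.):
python bin/spark_ecs.py --slaves 3 --instance-type ecs.t1.small --pwd MyPassword launch demo

# Manage the lifecycle of the same cluster:
python bin/spark_ecs.py stop demo
python bin/spark_ecs.py start demo
python bin/spark_ecs.py destroy demo

# Enable or disable an optional module (hdfs, hue, spark-notebook) on the cluster:
python bin/spark_ecs.py enable demo:spark-notebook
python bin/spark_ecs.py disable demo:spark-notebook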