├── ansible ├── .gitignore ├── nginx.sh ├── inventory │ ├── local │ ├── ec2.ini │ └── hosts ├── spark-jars.sh ├── sparknotebook.yml ├── sparknotebook-prov.yml ├── sparknotebook.sh ├── sparknotebook └── nginx.conf ├── .travis.yml ├── sbt └── sbt │ ├── bin │ ├── sbt-launch.jar │ ├── sbt.bat │ ├── sbt │ └── sbt-launch-lib.bash │ └── conf │ ├── sbtconfig.txt │ └── sbtopts ├── src ├── test │ ├── resources │ │ ├── FuncTestSparkNotebookContextFile2.csv │ │ ├── FuncTestSparkNotebookContextEmpty.csv │ │ └── FuncTestSparkNotebookContextFile1.csv │ └── scala │ │ └── eleflow │ │ └── sparknotebook │ │ ├── BeforeAndAfterWithContext.scala │ │ └── FuncTestSparkNotebookContext.scala ├── main │ ├── resources │ │ └── log4j.properties │ └── scala │ │ └── eleflow │ │ └── sparknotebook │ │ ├── exception │ │ ├── UnexpectedValueException.scala │ │ ├── InvalidDataException.scala │ │ └── UnexpectedFileFormatException.scala │ │ ├── enums │ │ ├── DataSetType.scala │ │ ├── PeriodOfDay.scala │ │ └── DateSplitType.scala │ │ ├── util │ │ ├── SparkNotebookConfig.scala │ │ ├── IntStringImplicitTypeConverter.scala │ │ └── DateTimeParser.scala │ │ ├── Main.scala │ │ ├── visualization │ │ └── RichDisplay.scala │ │ ├── SparkNotebookInterpreter.scala │ │ ├── data │ │ ├── DataTransformer.scala │ │ └── Dataset.scala │ │ └── SparkNotebookContext.scala ├── universal │ └── ec2 │ │ ├── deploy.generic │ │ └── root │ │ │ └── spark-ec2 │ │ │ └── ec2-variables.sh │ │ └── spark_ec2.py └── templates │ └── bash-template ├── .gitignore ├── aws.deploy.Dockerfile ├── nodocker.md ├── project ├── build.properties └── plugins.sbt ├── aws.deploy.sh ├── README.md └── LICENSE /ansible/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | logs 3 | -------------------------------------------------------------------------------- /ansible/nginx.sh: -------------------------------------------------------------------------------- 1 | sudo yum -y install nginx -------------------------------------------------------------------------------- /ansible/inventory/local: -------------------------------------------------------------------------------- 1 | [local] 2 | localhost 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | 3 | scala: 4 | - 2.10.4 5 | 6 | script: "sbt clean scoverage:test" -------------------------------------------------------------------------------- /sbt/sbt/bin/sbt-launch.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eleflow/sparknotebook/HEAD/sbt/sbt/bin/sbt-launch.jar -------------------------------------------------------------------------------- /src/test/resources/FuncTestSparkNotebookContextFile2.csv: -------------------------------------------------------------------------------- 1 | int,string2,double 2 | 5,vlr1,10.5 3 | 1,vl3,0.1 4 | 8,vlr1,10.0 -------------------------------------------------------------------------------- /src/test/resources/FuncTestSparkNotebookContextEmpty.csv: -------------------------------------------------------------------------------- 1 | id,int,string2,double 2 | 1,5,vlr1,10.5 3 | 2,1,vl3,0.1 4 | 3,8,,10.0 -------------------------------------------------------------------------------- /src/test/resources/FuncTestSparkNotebookContextFile1.csv: -------------------------------------------------------------------------------- 1 | 
id,int,string2,double 2 | 1,5,vlr1,10.5 3 | 2,1,vl3,0.1 4 | 3,8,vlr1,10.0 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | metastore_db 2 | derby.log 3 | target 4 | project/project/target 5 | project/target 6 | aws.deploy.env 7 | logs/ 8 | *.pem 9 | -------------------------------------------------------------------------------- /sbt/sbt/conf/sbtconfig.txt: -------------------------------------------------------------------------------- 1 | # Set the java args to high 2 | 3 | -Xmx512M 4 | 5 | -XX:MaxPermSize=256m 6 | 7 | -XX:ReservedCodeCacheSize=128m 8 | 9 | 10 | 11 | # Set the extra SBT options 12 | 13 | -Dsbt.log.format=true 14 | 15 | -------------------------------------------------------------------------------- /ansible/spark-jars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget -O /tmp/spark-1.2.0-bin-cdh4.tgz https://s3-us-west-2.amazonaws.com/sparknotebook-public/spark/spark-1.2.0-bin-cdh4.tgz 3 | tar -xzf /tmp/spark-1.2.0-bin-cdh4.tgz --strip-components 2 --wildcards --no-anchored 'spark-assembly*.jar' 4 | tar -xzf /tmp/spark-1.2.0-bin-cdh4.tgz --strip-components 2 --wildcards --no-anchored 'datanucleus*.jar' 5 | mkdir -p /opt/spark 6 | mkdir -p /opt/spark/lib 7 | cp spark-assembly*.jar /opt/spark/lib 8 | cp datanucleus*.jar /opt/spark/lib 9 | -------------------------------------------------------------------------------- /aws.deploy.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | MAINTAINER Paulo Magalhaes 3 | 4 | RUN apt-get update && apt-get install -y \ 5 | python \ 6 | python-pip \ 7 | software-properties-common \ 8 | && apt-add-repository ppa:ansible/ansible \ 9 | && apt-get update && apt-get install -y ansible 10 | 11 | RUN pip install awscli 12 | RUN pip install boto 13 | ADD aws.deploy.env /tmp/aws.deploy.env 14 | RUN . /tmp/aws.deploy.env && printf "[defaults]\nprivate_key_file=/.ssh/${AWS_KEY_PAIR}.pem\nhost_key_checking=False" > ~/.ansible.cfg 15 | 16 | ENTRYPOINT /sparknotebook/aws.deploy.sh 17 | -------------------------------------------------------------------------------- /nodocker.md: -------------------------------------------------------------------------------- 1 | # Setup without Docker 2 | 3 | If you want to use Sparknotebook without docker you have to: 4 | 1. [To install ansible](http://docs.ansible.com/intro_installation.html#installing-the-control-machine) 5 | 1. [To install boto](http://boto.readthedocs.org/en/latest/getting_started.html#installing-boto) 6 | 1. [To configure aws credentials in boto](http://boto.readthedocs.org/en/latest/getting_started.html#configuring-boto-credentials) 7 | 1. 
[Create a AWS IAM role](http://docs.aws.amazon.com/IAM/latest/UserGuide/roles-creatingrole-service.html) named **dev-ops** 8 | with the policies below: 9 | 10 | ```JSON 11 | { 12 | "Version": "2012-10-17", 13 | "Statement": [ 14 | { 15 | "Action": "ec2:*", 16 | "Effect": "Allow", 17 | "Resource": "*" 18 | } 19 | ] 20 | } 21 | ``` 22 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootLogger=INFO, file 3 | 4 | # Direct log messages to a log file 5 | log4j.appender.file=org.apache.log4j.RollingFileAppender 6 | 7 | #Redirect to Tomcat logs folder 8 | #log4j.appender.file.File=${catalina.home}/logs/logging.log 9 | 10 | log4j.appender.file.File=logs/logging.log 11 | log4j.appender.file.MaxFileSize=10MB 12 | log4j.appender.file.MaxBackupIndex=10 13 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 14 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 15 | 16 | # Settings to quiet third party logs that are too verbose 17 | log4j.logger.org.eclipse.jetty=INFO 18 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=INFO 19 | log4j.logger.org.apache.spark.repl=INFO 20 | 21 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one or more 2 | // contributor license agreements. See the NOTICE file distributed with 3 | // this work for additional information regarding copyright ownership. 4 | // The ASF licenses this file to You under the Apache License, Version 2.0 5 | // (the "License"); you may not use this file except in compliance with 6 | // the License. You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | sbt.version=0.13.7 17 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/exception/UnexpectedValueException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.exception 17 | 18 | /** 19 | * Created by dirceu on 16/12/14. 
20 | */ 21 | class UnexpectedValueException(message:String) extends Exception(message) 22 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/enums/DataSetType.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.enums 17 | 18 | /** 19 | * Created by dirceu on 12/12/14. 20 | */ 21 | object DataSetType extends Enumeration{ 22 | type Types = Value 23 | val Train,Test = Value 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/enums/PeriodOfDay.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.enums 17 | 18 | /** 19 | * Created by dirceu on 20/02/15. 20 | */ 21 | object PeriodOfDay extends Enumeration { 22 | type PeriodOfDay = Value 23 | val Morning, Afternoon, Evening, Dawn = Value 24 | } -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/exception/InvalidDataException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.exception 17 | 18 | import scala.util.control.NoStackTrace 19 | 20 | /** 21 | * Created by dirceu on 04/11/14. 
22 | */ 23 | class InvalidDataException (message: String) extends Exception(message) with NoStackTrace { 24 | def this() = this("") 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/exception/UnexpectedFileFormatException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.exception 17 | 18 | import scala.util.control.NoStackTrace 19 | 20 | /** 21 | * Created by dirceu on 15/10/14. 22 | */ 23 | class UnexpectedFileFormatException(message: String) extends Exception(message) with NoStackTrace { 24 | def this() = this("") 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/util/SparkNotebookConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.util 17 | /** 18 | * Created by dirceu on 05/12/14. 19 | */ 20 | object SparkNotebookConfig { 21 | val dateFormatFileName = "spark.properties" 22 | val propertyFolder = s"sparknotebook-${DateTimeParser.hashCode()}" 23 | val tempFolder = System.getProperty("java.io.tmpdir") 24 | } -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/enums/DateSplitType.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.enums 17 | 18 | /** 19 | * Created by dirceu on 02/12/14. 
20 | */ 21 | object DateSplitType { 22 | def contains(compare:Long,compareTo:Long) = (compare & compareTo) == compareTo 23 | val NoSplit = 1 << 0 24 | val Period = 1 << 1 25 | val DayOfAWeek = 1 << 2 26 | val WorkNonWorkingDay = 1 << 3 27 | val PeriodDayOfAWeek = Period | DayOfAWeek 28 | } 29 | -------------------------------------------------------------------------------- /sbt/sbt/conf/sbtopts: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------ # 2 | # The SBT Configuration file. # 3 | # ------------------------------------------------ # 4 | 5 | 6 | # Disable ANSI color codes 7 | # 8 | #-no-colors 9 | 10 | # Starts sbt even if the current directory contains no sbt project. 11 | # 12 | -sbt-create 13 | 14 | # Path to global settings/plugins directory (default: ~/.sbt) 15 | # 16 | #-sbt-dir /etc/sbt 17 | 18 | # Path to shared boot directory (default: ~/.sbt/boot in 0.11 series) 19 | # 20 | #-sbt-boot ~/.sbt/boot 21 | 22 | # Path to local Ivy repository (default: ~/.ivy2) 23 | # 24 | #-ivy ~/.ivy2 25 | 26 | # set memory options 27 | # 28 | #-mem 29 | 30 | # Use local caches for projects, no sharing. 31 | # 32 | #-no-share 33 | 34 | # Put SBT in offline mode. 35 | # 36 | #-offline 37 | 38 | # Sets the SBT version to use. 39 | #-sbt-version 0.11.3 40 | 41 | # Scala version (default: latest release) 42 | # 43 | #-scala-home 44 | #-scala-version 45 | 46 | # java version (default: java from PATH, currently $(java -version |& grep version)) 47 | # 48 | #-java-home 49 | 50 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/util/IntStringImplicitTypeConverter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.util 17 | 18 | import com.gensler.scalavro.util.Union 19 | import com.gensler.scalavro.util.Union.union 20 | /** 21 | * Created by dirceu on 03/11/14. 22 | */ 23 | object IntStringImplicitTypeConverter { 24 | 25 | type IS = union[Int]#or[String] 26 | implicit def convIntToUnion(i:Int): Union[IS] = { 27 | val union = new Union[IS] 28 | union.assign(i) 29 | union 30 | } 31 | implicit def convStringToUnion(i:String): Union[IS] = { 32 | val union = new Union[IS] 33 | union.assign(i) 34 | union 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one or more 2 | // contributor license agreements. See the NOTICE file distributed with 3 | // this work for additional information regarding copyright ownership. 
4 | // The ASF licenses this file to You under the Apache License, Version 2.0 5 | // (the "License"); you may not use this file except in compliance with 6 | // the License. You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | resolvers ++= Seq( 17 | "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", 18 | "Sonatype OSS Snapshots Repository" at "http://oss.sonatype.org/content/groups/public" 19 | ) 20 | 21 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "0.7.4") 22 | 23 | addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "0.6.2") 24 | 25 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.4") 26 | 27 | addSbtPlugin("org.scoverage" %% "sbt-scoverage" % "0.99.7.1") -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/Main.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package eleflow.sparknotebook 17 | 18 | import sun.misc.{Signal,SignalHandler} 19 | 20 | import org.zeromq.ZMQ 21 | 22 | import scalax.io.JavaConverters._ 23 | import scalax.file.Path 24 | 25 | import org.refptr.iscala._ 26 | import json.JsonUtil._ 27 | import msg._ 28 | 29 | object Main extends App { 30 | val options = new Options(args) 31 | 32 | val thread = new Thread { 33 | override def run() { 34 | val iscala = new IScala(options.config){ 35 | override lazy val interpreter = new SparkNotebookInterpreter(classpath, options.config.args) 36 | } 37 | iscala.heartBeat.join() 38 | } 39 | } 40 | 41 | thread.setName("IScala") 42 | thread.setDaemon(true) 43 | thread.start() 44 | thread.join() 45 | } 46 | 47 | 48 | -------------------------------------------------------------------------------- /ansible/sparknotebook.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: tag_Name_sparknotebook 3 | gather_facts: True 4 | user: ec2-user 5 | sudo: True 6 | tasks: 7 | - name: install spark jars 8 | script: spark-jars.sh 9 | - debug: var=script.stdout 10 | - debug: var=script.stderr 11 | - name: copy sparknotebook to init.d 12 | copy: src=sparknotebook dest=/etc/init.d/sparknotebook 13 | - name: copy local sparknotebook 14 | copy: src=../target/universal/sparknotebook-0.1.0-SNAPSHOT.zip dest=/tmp/sparknotebook-0.1.0-SNAPTSHOT.zip 15 | ignore_errors: yes 16 | - name: ipython notebook and sparknotebook 17 | script: sparknotebook.sh 18 | - debug: var=script.stdout 19 | - debug: var=script.stderr 20 | - name: start sparknotebook 21 | service: name=sparknotebook state=restarted enabled=yes 22 | - debug: var=script.stdout 23 | - debug: var=script.stderr 24 | - name: install nginx 25 | script: nginx.sh 26 | tags: 27 | - nginx 28 | - debug: var=script.stdout 29 | - debug: var=script.stderr 30 | - name: copy nginx conf 31 | copy: src=nginx.conf dest=/etc/nginx/nginx.conf 32 | tags: 33 | - nginx 34 | - debug: var=script.stdout 35 | - debug: var=script.stderr 36 | - name: nginx on startup 37 | service: name=nginx state=started enabled=on 38 | tags: 39 | - nginx 40 | - nginx-conf 41 | - debug: var=script.stdout 42 | - debug: var=script.stderr 43 | 44 | -------------------------------------------------------------------------------- /sbt/sbt/bin/sbt.bat: -------------------------------------------------------------------------------- 1 | @REM SBT launcher script 2 | @REM 3 | @REM Envioronment: 4 | @REM JAVA_HOME - location of a JDK home dir (mandatory) 5 | @REM SBT_OPTS - JVM options (optional) 6 | @REM Configuration: 7 | @REM sbtconfig.txt found in the SBT_HOME. 8 | 9 | @REM ZOMG! We need delayed expansion to build up CFG_OPTS later 10 | @setlocal enabledelayedexpansion 11 | 12 | @echo off 13 | set SBT_HOME=%~dp0 14 | 15 | rem FIRST we load the config file of extra options. 16 | set FN=%SBT_HOME%\..\conf\sbtconfig.txt 17 | set CFG_OPTS= 18 | FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%FN%") DO ( 19 | set DO_NOT_REUSE_ME=%%i 20 | rem ZOMG (Part #2) WE use !! here to delay the expansion of 21 | rem CFG_OPTS, otherwise it remains "" for this loop. 22 | set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME! 
23 | ) 24 | 25 | rem We use the value of the JAVACMD environment variable if defined 26 | set _JAVACMD=%JAVACMD% 27 | 28 | if "%_JAVACMD%"=="" ( 29 | if not "%JAVA_HOME%"=="" ( 30 | if exist "%JAVA_HOME%\bin\java.exe" set "_JAVACMD=%JAVA_HOME%\bin\java.exe" 31 | ) 32 | ) 33 | 34 | if "%_JAVACMD%"=="" set _JAVACMD=java 35 | 36 | rem We use the value of the JAVA_OPTS environment variable if defined, rather than the config. 37 | set _JAVA_OPTS=%JAVA_OPTS% 38 | if "%_JAVA_OPTS%"=="" set _JAVA_OPTS=%CFG_OPTS% 39 | 40 | :run 41 | 42 | "%_JAVACMD%" %_JAVA_OPTS% %SBT_OPTS% -cp "%SBT_HOME%sbt-launch.jar" xsbt.boot.Boot %* 43 | if ERRORLEVEL 1 goto error 44 | goto end 45 | 46 | :error 47 | @endlocal 48 | exit /B 1 49 | 50 | 51 | :end 52 | @endlocal 53 | exit /B 0 54 | -------------------------------------------------------------------------------- /src/universal/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # These variables are automatically filled in by the spark-ec2 script. 
21 | export MASTERS="{{master_list}}" 22 | export SLAVES="{{slave_list}}" 23 | export HDFS_DATA_DIRS="{{hdfs_data_dirs}}" 24 | export MAPRED_LOCAL_DIRS="{{mapred_local_dirs}}" 25 | export SPARK_LOCAL_DIRS="{{spark_local_dirs}}" 26 | export MODULES="{{modules}}" 27 | export SPARK_VERSION="{{spark_version}}" 28 | export SHARK_VERSION="{{shark_version}}" 29 | export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}" 30 | export SWAP_MB="{{swap}}" 31 | export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}" 32 | export SPARK_MASTER_OPTS="{{spark_master_opts}}" 33 | export METASTORE_USER="{{metastore_user}}" 34 | export METASTORE_PASSWD="{{metastore_passwd}}" -------------------------------------------------------------------------------- /aws.deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | create-profile(){ 3 | h=`aws iam list-roles|grep RoleName |grep dev-ops |cut -d '"' -f 4` 4 | if [ -z $h ]; then 5 | echo "creating dev-ops role" 6 | aws iam create-instance-profile --instance-profile-name dev-ops 7 | aws iam create-role --role-name dev-ops --assume-role-policy-document '{ "Version": "2012-10-17", "Statement": [ { "Action": "sts:AssumeRole", "Principal": { "Service": "ec2.amazonaws.com" }, "Effect": "Allow", "Sid": "" } ] }' 8 | aws iam add-role-to-instance-profile --instance-profile-name dev-ops --role-name dev-ops 9 | aws iam put-role-policy --role-name dev-ops --policy-name 'AllowEc2forDevOps' --policy-document '{ "Version": "2012-10-17", "Statement": [ { "Action": "ec2:*", "Effect": "Allow", "Resource": "*" } ] }' 10 | aws iam put-role-policy --role-name dev-ops --policy-name 'AllowS3forDevOps' --policy-document '{ "Version": "2012-10-17", "Statement": [ { "Action": "s3:*", "Effect": "Allow", "Resource": "*" } ] }' 11 | aws iam put-role-policy --role-name dev-ops --policy-name 'AllowPassRoleforDevOps' --policy-document '{ "Version": "2012-10-17", "Statement": [ { "Sid": "Stmt1409776891000", "Effect": "Allow", "Action": [ "iam:PassRole" ], "Resource": [ "*" ] } ] }' 12 | else 13 | echo "dev-ops role already exits" 14 | fi 15 | } 16 | create-profile 17 | cd /sparknotebook/ansible 18 | ansible-playbook -vvvv -i inventory/local --extra-vars "keypair=$AWS_KEY_PAIR" sparknotebook-prov.yml 19 | ansible-playbook -vvvv -i inventory/hosts --extra-vars "keypair=$AWS_KEY_PAIR" sparknotebook.yml -------------------------------------------------------------------------------- /src/test/scala/eleflow/sparknotebook/BeforeAndAfterWithContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package eleflow.sparknotebook 17 | 18 | import org.apache.log4j.{Level, Logger} 19 | import org.apache.spark.SparkConf 20 | import org.scalatest.{BeforeAndAfterEach, Suite} 21 | 22 | object TestSparkConf { 23 | @transient lazy val conf = { 24 | val sconf = new SparkConf() 25 | sconf.set("spark.app.name", "teste") 26 | sconf 27 | } 28 | 29 | val separator ="," 30 | 31 | } 32 | 33 | /** 34 | * Created by dirceu on 22/10/14. 35 | */ 36 | trait BeforeAndAfterWithContext extends BeforeAndAfterEach { 37 | this: Suite => 38 | 39 | val defaultFilePath = "src/test/resources/" 40 | import eleflow.sparknotebook.TestSparkConf._ 41 | ClusterSettings.master=Some("local[*]") 42 | conf.set("spark.driver.allowMultipleContexts","true") 43 | val context = new SparkNotebookContext(conf) 44 | 45 | override def beforeEach() = { 46 | setLogLevels(Level.INFO, Seq("spark", "org.eclipse.jetty", "akka")) 47 | } 48 | 49 | def setLogLevels(level: org.apache.log4j.Level, loggers: TraversableOnce[String]) = { 50 | loggers.map { 51 | loggerName => 52 | val logger = Logger.getLogger(loggerName) 53 | val prevLevel = logger.getLevel() 54 | logger.setLevel(level) 55 | loggerName -> prevLevel 56 | }.toMap 57 | } 58 | 59 | override def afterEach() = { 60 | context.clearContext 61 | System.clearProperty("spark.master.port") 62 | } 63 | } 64 | 65 | -------------------------------------------------------------------------------- /ansible/sparknotebook-prov.yml: -------------------------------------------------------------------------------- 1 | - hosts: localhost 2 | connection: local 3 | gather_facts: False 4 | vars: 5 | keypair: "objeleflow" 6 | instance_type: "r3.xlarge" 7 | price: "0.15" 8 | image: "ami-d13845e1" 9 | group: "SparkNotebookApplication" 10 | region: "us-west-2" 11 | zone: "us-west-2b" 12 | iamrole: "dev-ops" 13 | tasks: 14 | - name: create sparknotebook security group 15 | ec2_group: 16 | name: "{{ group }}" 17 | description: Security Group for the Web app 18 | region: "{{ region }}" 19 | purge_rules: false 20 | purge_rules_egress: false 21 | rules: 22 | - proto: tcp 23 | from_port: 80 24 | to_port: 80 25 | cidr_ip: 0.0.0.0/0 26 | - proto: tcp 27 | from_port: 22 28 | to_port: 22 29 | cidr_ip: 0.0.0.0/0 30 | - proto: tcp 31 | from_port: 4040 32 | to_port: 4040 33 | cidr_ip: 0.0.0.0/0 34 | - proto: tcp 35 | from_port: 8080 36 | to_port: 8080 37 | cidr_ip: 0.0.0.0/0 38 | rules_egress: 39 | - proto: all 40 | cidr_ip: 0.0.0.0/0 41 | 42 | - name: create sparknotebook instance 43 | ec2: image={{ image }} 44 | instance_type={{ instance_type }} 45 | keypair={{ keypair }} 46 | instance_tags='{"Name":"sparknotebook"}' 47 | instance_profile_name={{ iamrole }} 48 | region={{ region }} 49 | zone={{ zone }} 50 | group={{ group }} 51 | 52 | wait=true 53 | #spot_price={{price}} 54 | register: ec2_info 55 | - debug: var=script.stdout 56 | - debug: var=script.stderr 57 | # # vpc_subnet_id=subnet-e32aff86 58 | # assign_public_ip=yes 59 | # - add_host: hostname={{ item.public_ip }} groupname=ec2hosts 60 | # with_items: ec2_info.instances 61 | - name: wait for instances to listen on port:22 62 | wait_for: 63 | state=started 64 | host={{ item.public_dns_name }} 65 | port=22 66 | timeout=600 67 | with_items: ec2_info.instances 68 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/visualization/RichDisplay.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.visualization 17 | 18 | import eleflow.sparknotebook.data.{FileDataset, Dataset} 19 | import org.apache.spark.sql.SchemaRDD 20 | import org.refptr.iscala.display.HTMLDisplay 21 | 22 | import scalatags.Text.TypedTag 23 | import scalatags.Text.all._ 24 | import Dataset._ 25 | 26 | 27 | /** 28 | * SparkNotebook 29 | * Copyright (C) 2014 eleflow. 30 | * User: paulomagalhaes 31 | * Date: 10/27/14 10:24 AM 32 | */ 33 | object RichDisplay { 34 | 35 | 36 | 37 | implicit val HTMLTypedTag = HTMLDisplay[TypedTag[String]](_.toString) 38 | implicit val HTMLSchemaRdd = HTMLDisplay[SchemaRDD, TypedTag[String]] { rdd:SchemaRDD => 39 | 40 | div(style:="overflow:scroll", table( 41 | tr( 42 | rdd.schema.fieldNames.map(column=>th(column)) 43 | ), 44 | rdd.take(7).map(row=> 45 | tr(row.map(field=> td(String.valueOf(field))) 46 | )) 47 | )) 48 | } 49 | 50 | 51 | implicit val HTMLSeqAny = HTMLDisplay[Seq[Any], TypedTag[String]] { seq => 52 | div(style:="overflow:scroll", table( 53 | seq.zipWithIndex.map(row=> 54 | tr(th(row._2), td(String.valueOf(row._1)))) 55 | )) 56 | } 57 | 58 | implicit val HTMLSeqTuples = HTMLDisplay[Seq[Product], TypedTag[String]] { seq => 59 | div(style:="overflow:scroll", table( 60 | seq.zipWithIndex.map(row=> 61 | tr(th(row._2), row._1.productIterator.toList.map(field=> td(String.valueOf(field))) 62 | )) 63 | )) 64 | } 65 | 66 | 67 | implicit val HTMLMapAny = HTMLDisplay[Map[Any, Any], TypedTag[String]] { aMap => 68 | 69 | div(style:="overflow:scroll", table( 70 | aMap.map(entry=> 71 | tr(td(String.valueOf(entry._1)), td(String.valueOf(entry._2)) 72 | )).toSeq 73 | )) 74 | } 75 | 76 | implicit val HTMLArrayTuples = HTMLDisplay[Array[Product], Seq[Product]] { array => 77 | array.toSeq 78 | } 79 | 80 | implicit val HTMLDataset = HTMLDisplay[Dataset, SchemaRDD] { dataset => 81 | dataset 82 | } 83 | 84 | implicit val HTMLFileDataset = HTMLDisplay[FileDataset, SchemaRDD] { dataset => 85 | dataset 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /ansible/sparknotebook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | version="0.1.0" 3 | yum -y update 4 | 5 | yum -y groupinstall "Development Tools" 6 | 7 | #install python 2.7 8 | yum -y install python27-devel.x86_64 9 | wget https://bootstrap.pypa.io/get-pip.py 10 | python27 ./get-pip.py 11 | # install zeroMQ 12 | wget http://download.zeromq.org/zeromq-4.0.4.tar.gz 13 | tar xzvf zeromq-4.0.4.tar.gz 14 | cd zeromq-4.0.4 15 | ./config 16 | make install 17 | #install ipython 18 | /usr/local/bin/pip2.7 install "ipython[all]"==2.4 19 | 20 | 21 | createSparknotebookprofile(){ 22 | sudo -u sparknotebook /usr/local/bin/ipython profile create sparknotebook 23 | rm -f ~sparknotebook/.ipython/profile_sparknotebook/ipython_config.py 24 | sudo -u sparknotebook cat << EOF >> 
~sparknotebook/.ipython/profile_sparknotebook/ipython_config.py 25 | # Configuration file for ipython. 26 | 27 | c = get_config() 28 | 29 | c.KernelManager.kernel_cmd = ["/usr/share/sparknotebook/bin/sparknotebook", 30 | #"-mem","28000", 31 | "--profile", "{connection_file}", 32 | "--parent"] 33 | c.NotebookApp.ip = "*" # only add this line if you want IPython-notebook being open to the public 34 | c.NotebookApp.open_browser = False # only add this line if you want to suppress opening a browser after IPython-notebook initialization 35 | EOF 36 | } 37 | 38 | 39 | # Adding system user/group : sparknotebook and sparknotebook 40 | if ! getent group | grep -q "^sparknotebook:" ; 41 | then 42 | echo "Creating system group: sparknotebook" 43 | groupadd sparknotebook 44 | fi 45 | if ! getent passwd | grep -q "^sparknotebook:"; 46 | then 47 | echo "Creating system user: sparknotebook" 48 | useradd --gid sparknotebook --create-home --comment "SparkNotebook Interactive User" sparknotebook 49 | fi 50 | 51 | # install Spark Notebook 52 | file="/tmp/sparknotebook-$version.zip" 53 | if ! test -f "$file" 54 | then 55 | aws s3 cp s3://sparknotebook-public/sparknotebook/sparknotebook-$version.zip /tmp/ 56 | fi 57 | 58 | unzip -o /tmp/sparknotebook-$version.zip -d /usr/share 59 | rm -f /tmp/sparknotebook-$version.zip 60 | rm -f /usr/share/sparknotebook 61 | ln -s /usr/share/sparknotebook-$version /usr/share/sparknotebook 62 | 63 | chown -R sparknotebook:sparknotebook /usr/share/sparknotebook-$version 64 | chown sparknotebook:sparknotebook /usr/share/sparknotebook 65 | 66 | #install ipython init.d scripts 67 | mkdir -p /files 68 | chown sparknotebook:sparknotebook /files 69 | mkdir -p /var/log/sparknotebook 70 | chown sparknotebook:sparknotebook /var/log/sparknotebook 71 | mkdir -p /etc/default/sparknotebook 72 | chown sparknotebook:sparknotebook /etc/default/sparknotebook 73 | mkdir -p /var/run/sparknotebook 74 | chown sparknotebook:sparknotebook /var/run/sparknotebook 75 | sudo -u sparknotebook ipython profile create sparknotebook 76 | 77 | createSparknotebookprofile 78 | chmod +x /etc/init.d/sparknotebook 79 | -------------------------------------------------------------------------------- /ansible/inventory/ec2.ini: -------------------------------------------------------------------------------- 1 | # Ansible EC2 external inventory script settings 2 | # 3 | 4 | [ec2] 5 | 6 | # to talk to a private eucalyptus instance uncomment these lines 7 | # and edit edit eucalyptus_host to be the host name of your cloud controller 8 | #eucalyptus = True 9 | #eucalyptus_host = clc.cloud.domain.org 10 | 11 | # AWS regions to make calls to. Set this to 'all' to make request to all regions 12 | # in AWS and merge the results together. Alternatively, set this to a comma 13 | # separated list of regions. E.g. 'us-east-1,us-west-1,us-west-2' 14 | regions = us-west-2,sa-east-1 15 | regions_exclude = us-gov-west-1,cn-north-1 16 | 17 | # When generating inventory, Ansible needs to know how to address a server. 18 | # Each EC2 instance has a lot of variables associated with it. Here is the list: 19 | # http://docs.pythonboto.org/en/latest/ref/ec2.html#module-boto.ec2.instance 20 | # Below are 2 variables that are used as the address of a server: 21 | # - destination_variable 22 | # - vpc_destination_variable 23 | 24 | # This is the normal destination variable to use. If you are running Ansible 25 | # from outside EC2, then 'public_dns_name' makes the most sense. 
If you are 26 | # running Ansible from within EC2, then perhaps you want to use the internal 27 | # address, and should set this to 'private_dns_name'. 28 | destination_variable = public_dns_name 29 | 30 | # For server inside a VPC, using DNS names may not make sense. When an instance 31 | # has 'subnet_id' set, this variable is used. If the subnet is public, setting 32 | # this to 'ip_address' will return the public IP address. For instances in a 33 | # private subnet, this should be set to 'private_ip_address', and Ansible must 34 | # be run from with EC2. 35 | vpc_destination_variable = ip_address 36 | 37 | # To tag instances on EC2 with the resource records that point to them from 38 | # Route53, uncomment and set 'route53' to True. 39 | route53 = False 40 | 41 | # Additionally, you can specify the list of zones to exclude looking up in 42 | # 'route53_excluded_zones' as a comma-separated list. 43 | # route53_excluded_zones = samplezone1.com, samplezone2.com 44 | 45 | # By default, only EC2 instances in the 'running' state are returned. Set 46 | # 'all_instances' to True to return all instances regardless of state. 47 | all_instances = False 48 | 49 | # By default, only RDS instances in the 'available' state are returned. Set 50 | # 'all_rds_instances' to True return all RDS instances regardless of state. 51 | all_rds_instances = False 52 | 53 | # API calls to EC2 are slow. For this reason, we cache the results of an API 54 | # call. Set this to the path you want cache files to be written to. Two files 55 | # will be written to this directory: 56 | # - ansible-ec2.cache 57 | # - ansible-ec2.index 58 | cache_path = ~/.ansible/tmp 59 | 60 | # The number of seconds a cache file is considered valid. After this many 61 | # seconds, a new API call will be made, and the cache file will be updated. 62 | # To disable the cache, set this value to 0 63 | cache_max_age = 300 64 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/SparkNotebookInterpreter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package eleflow.sparknotebook 17 | 18 | import org.apache.spark.SparkConf 19 | import org.apache.spark.repl.SparkILoop 20 | 21 | import scala.collection.immutable 22 | import scala.tools.nsc.interpreter.{IR, NamedParam} 23 | import org.apache.spark.repl.SparkILoop 24 | import org.apache.spark.{SparkConf} 25 | import org.refptr.iscala.{Results, Interpreter} 26 | 27 | 28 | class SparkNotebookInterpreter(classpath: String, args: Seq[String], usejavacp: Boolean=true) extends Interpreter(classpath, args, false, usejavacp) { 29 | 30 | var snc: SparkNotebookContext = _ 31 | 32 | override def initializeSpark() { 33 | snc = createContext() 34 | 35 | val namedParam = NamedParam[SparkNotebookContext]("snc", snc) 36 | intp.beQuietDuring(bind(namedParam.name, namedParam.tpe, namedParam.value, immutable.List("@transient"))) match { 37 | case IR.Success => Unit 38 | case _ => throw new RuntimeException("Spark failed to initialize") 39 | } 40 | 41 | val importSVresult = interpret( """ 42 | import org.apache.spark.SparkContext._ 43 | import eleflow.sparknotebook._ 44 | import eleflow.sparknotebook.visualization.RichDisplay._ 45 | import snc._ 46 | import eleflow.sparknotebook.data.Dataset._ 47 | """) 48 | importSVresult match { 49 | case Results.Value(value, tpe, repr) => Unit 50 | case Results.NoValue => Unit 51 | case Results.Exception(_,_,_,ee) => throw new RuntimeException("SparkContext failed to be imported", ee) 52 | case _ => throw new RuntimeException("SparkContext failed to be imported") 53 | } 54 | 55 | } 56 | 57 | override def sparkCleanUp() { 58 | if (snc!=null) { 59 | snc.clearContext 60 | } 61 | } 62 | 63 | override lazy val appName: String = "SparkNotebook" 64 | 65 | def createContext(): SparkNotebookContext = { 66 | val execUri = System.getenv("SPARK_EXECUTOR_URI") 67 | val jars = SparkILoop.getAddedJars 68 | val conf = new SparkConf() 69 | .setMaster(getMaster()) 70 | .setAppName(this.appName) 71 | .setJars(jars) 72 | .set("spark.repl.class.uri", intp.classServer.uri) //very important! 
spark treat REPL very differently 73 | .set("spark.files.overwrite","true") 74 | if (execUri != null) { 75 | conf.set("spark.executor.uri", execUri) 76 | } 77 | if (System.getenv("SPARK_HOME") != null) { 78 | conf.setSparkHome(System.getenv("SPARK_HOME")) 79 | } 80 | new SparkNotebookContext(conf) 81 | } 82 | 83 | protected def getMaster(): String = { 84 | val master = { 85 | val envMaster = sys.env.get("MASTER") 86 | val propMaster = sys.props.get("spark.master") 87 | propMaster.orElse(envMaster).getOrElse("local[*]") 88 | } 89 | master 90 | } 91 | 92 | } -------------------------------------------------------------------------------- /ansible/sparknotebook: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # sparknotebook \ 4 | # 5 | # chkconfig: 2345 20 80 6 | # description: Spark Notebook Interactive 7 | # 8 | 9 | ### BEGIN INIT INFO 10 | # Provides: sparknotebook 11 | # Required-Start: $remote_fs $syslog 12 | # Required-Stop: $remote_fs $syslog 13 | # Default-Start: 2 3 4 5 14 | # Default-Stop: 0 1 6 15 | # Should-Start: 16 | # Should-Stop: 17 | # Short-Description: Spark Notebook Interactive 18 | # Description: Spark Notebook Interactive 19 | ### END INIT INFO 20 | 21 | ### ----------------- 22 | # This script was created using following sources 23 | # 24 | # http://stackoverflow.com/questions/8124345/call-to-daemon-in-a-etc-init-d-script-is-blocking-not-running-in-background 25 | # https://fedoraproject.org/wiki/Packaging:SysVInitScript#Initscript_template 26 | ### ----------------- 27 | 28 | # Source function library. 29 | . /etc/rc.d/init.d/functions 30 | 31 | prog="sparknotebook" 32 | 33 | # FIXME The pid file should be handled by the executed script 34 | # The pid can be filled in in this script 35 | PIDFILE=/var/run/sparknotebook/running.pid 36 | 37 | if [ -z "$DAEMON_USER" ]; then 38 | DAEMON_USER=sparknotebook 39 | fi 40 | 41 | 42 | # smb could define some additional options in $RUN_OPTS 43 | RUN_CMD="ipython notebook --profile sparknotebook" 44 | 45 | [ -e /etc/sysconfig/$prog ] && . /etc/sysconfig/$prog 46 | 47 | lockfile=/var/lock/subsys/$prog 48 | 49 | start() { 50 | echo -n $"Starting $prog: " 51 | 52 | nohup runuser -l $DAEMON_USER -c "${RUN_CMD}" >> /var/log/sparknotebook/daemon.log 2>&1 & 53 | 54 | # The way to go, but doesn't work properly 55 | # If the app creates the pid file this gets messy 56 | # daemon --user $DAEMON_USER --pidfile $PIDFILE $RUN_CMD & 57 | 58 | 59 | retval=$? # last error code 60 | PID=$! # pid of last backgrounded process 61 | [ $retval -eq 0 ] && touch ${lockfile} && success || failure 62 | 63 | # Insert pid into pid file for CentOS killproc function 64 | [ -d "/var/run/sparknotebook" ] || install -d -o "$DAEMON_USER" -m750 "/var/run/sparknotebook" 65 | echo 66 | echo $PID > ${PIDFILE} 67 | return $retval 68 | } 69 | 70 | stop() { 71 | echo -n $"Stopping $prog: " 72 | killproc -p $PIDFILE $prog 73 | retval=$? 
74 | [ $retval -eq 0 ] && rm -f $lockfile 75 | return $retval 76 | } 77 | 78 | restart() { 79 | stop 80 | start 81 | } 82 | 83 | reload() { 84 | restart 85 | } 86 | 87 | force_reload() { 88 | restart 89 | } 90 | 91 | rh_status() { 92 | # run checks to determine if the service is running or use generic status 93 | status -p $PIDFILE -l $lockfile $prog 94 | } 95 | 96 | rh_status_q() { 97 | rh_status >/dev/null 2>&1 98 | } 99 | 100 | 101 | case "$1" in 102 | start) 103 | rh_status_q && exit 0 104 | $1 105 | ;; 106 | stop) 107 | rh_status_q || exit 0 108 | $1 109 | ;; 110 | restart) 111 | $1 112 | ;; 113 | reload) 114 | rh_status || exit 7 115 | $1 116 | ;; 117 | force-reload) 118 | force_reload 119 | ;; 120 | status) 121 | rh_status 122 | ;; 123 | condrestart|try-restart) 124 | rh_status || exit 0 125 | restart 126 | ;; 127 | *) 128 | echo $"Usage: $0 {start|stop|status|restart|condrestart|try-restart|reload|force-reload}" 129 | exit 2 130 | esac 131 | exit $? 132 | -------------------------------------------------------------------------------- /src/test/scala/eleflow/sparknotebook/FuncTestSparkNotebookContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook 17 | 18 | import eleflow.sparknotebook.data.{DataTransformer, Dataset} 19 | import eleflow.sparknotebook.enums.DataSetType 20 | import org.apache.spark.SparkException 21 | import org.scalatest._ 22 | import org.scalatest.mock.MockitoSugar 23 | 24 | /** 25 | * Created by dirceu on 14/10/14. 
26 | */ 27 | class FuncTestSparkNotebookContext extends FlatSpec with Matchers with MockitoSugar with BeforeAndAfterWithContext { 28 | this: Suite => 29 | 30 | val uberContext = context 31 | 32 | "Functional SparkNotebookContext" should 33 | "correctly load rdd" in { 34 | 35 | import eleflow.sparknotebook.data.Dataset._ 36 | 37 | val dataset = Dataset(uberContext, s"${defaultFilePath}FuncTestSparkNotebookContextFile1.csv") 38 | 39 | val testDataSet = Dataset(uberContext, s"${defaultFilePath}FuncTestSparkNotebookContextFile2.csv") 40 | 41 | val unionDataset= DataTransformer.extractStringsFromTrainTestSchema(dataset.toSchemaRDD, testDataSet.toSchemaRDD, 42 | Seq(0)) 43 | val normalized = unionDataset.summarizedColumns.map { 44 | case (colIndex, (size, funcIndex, funcValue)) => (colIndex + 1, (size, funcIndex, funcValue)) 45 | } 46 | val result = DataTransformer.createLabeledPointFromRDD(dataset, Seq(0), normalized,DataSetType.Train,unionDataset.columnsSize) 47 | val all = result.take(3) 48 | val (_, first) = all.head 49 | val (_, second) = all.tail.head 50 | assert(first.label == 1) 51 | assert(first.features.toArray.deep == Array[Double](5.0, 0.0, 1.0, 10.5).deep) 52 | assert(second.label == 2) 53 | assert(second.features.toArray.deep == Array[Double](1.0, 1.0, 0.0, 0.1).deep) 54 | uberContext.clearContext 55 | } 56 | 57 | it should "Throw an exception when process an empty numeric column" in { 58 | 59 | @transient lazy val context = uberContext 60 | 61 | val sc = context.sparkContext 62 | try { 63 | import eleflow.sparknotebook.data.Dataset._ 64 | val dataset = Dataset(context, s"${defaultFilePath}FuncTestSparkNotebookContextFile1.csv") 65 | dataset.take(3) 66 | } catch { 67 | case e: SparkException => { 68 | assert(e.getMessage.contains("UnexpectedFileFormatException")) 69 | } 70 | } 71 | } 72 | 73 | it should "Correct handle empty string values" in { 74 | @transient lazy val context = uberContext 75 | val sc = context.sparkContext 76 | val schemaRdd = Dataset(context, s"${defaultFilePath}FuncTestSparkNotebookContextEmpty.csv").schemaRDD 77 | val result = DataTransformer.createLabeledPointFromRDD(schemaRdd, Seq(0),DataSetType.Train) 78 | 79 | } 80 | 81 | it should "Throw an exception when input have different number of columns" in { 82 | val sc = uberContext.sparkContext() 83 | try { 84 | 85 | val result = context.load(s"${defaultFilePath}FuncTestSparkNotebookContextFile1.csv", TestSparkConf.separator) 86 | } catch { 87 | case e: SparkException => 88 | assert(e.getMessage.contains("UnexpectedFileFormatException")) 89 | } 90 | } 91 | 92 | } 93 | -------------------------------------------------------------------------------- /ansible/nginx.conf: -------------------------------------------------------------------------------- 1 | # For more information on configuration, see: 2 | # * Official English Documentation: http://nginx.org/en/docs/ 3 | # * Official Russian Documentation: http://nginx.org/ru/docs/ 4 | 5 | user nginx; 6 | worker_processes 1; 7 | 8 | error_log /var/log/nginx/error.log; 9 | #error_log /var/log/nginx/error.log notice; 10 | #error_log /var/log/nginx/error.log info; 11 | 12 | pid /var/run/nginx.pid; 13 | 14 | 15 | events { 16 | worker_connections 1024; 17 | } 18 | 19 | 20 | http { 21 | include /etc/nginx/mime.types; 22 | default_type application/octet-stream; 23 | 24 | log_format main '$remote_addr - $remote_user [$time_local] "$request" ' 25 | '$status $body_bytes_sent "$http_referer" ' 26 | '"$http_user_agent" "$http_x_forwarded_for"'; 27 | 28 | access_log 
/var/log/nginx/access.log main; 29 | 30 | sendfile on; 31 | #tcp_nopush on; 32 | 33 | #keepalive_timeout 0; 34 | keepalive_timeout 65; 35 | 36 | #gzip on; 37 | 38 | # Load modular configuration files from the /etc/nginx/conf.d directory. 39 | # See http://nginx.org/en/docs/ngx_core_module.html#include 40 | # for more information. 41 | include /etc/nginx/conf.d/*.conf; 42 | 43 | index index.html index.htm; 44 | 45 | server { 46 | listen 80; 47 | server_name localhost; 48 | 49 | #charset koi8-r; 50 | 51 | #access_log /var/log/nginx/host.access.log main; 52 | 53 | location / { 54 | proxy_pass http://localhost:8888; 55 | proxy_http_version 1.1; 56 | proxy_set_header Upgrade $http_upgrade; 57 | proxy_set_header Connection "upgrade"; 58 | proxy_set_header Origin ""; 59 | } 60 | 61 | location /files/ { 62 | root /; 63 | autoindex on; 64 | } 65 | 66 | # redirect server error pages to the static page /40x.html 67 | # 68 | error_page 404 /404.html; 69 | location = /40x.html { 70 | } 71 | 72 | # redirect server error pages to the static page /50x.html 73 | # 74 | error_page 500 502 503 504 /50x.html; 75 | location = /50x.html { 76 | } 77 | 78 | # proxy the PHP scripts to Apache listening on 127.0.0.1:80 79 | # 80 | #location ~ \.php$ { 81 | # proxy_pass http://127.0.0.1; 82 | #} 83 | 84 | # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000 85 | # 86 | #location ~ \.php$ { 87 | # root html; 88 | # fastcgi_pass 127.0.0.1:9000; 89 | # fastcgi_index index.php; 90 | # fastcgi_param SCRIPT_FILENAME /scripts$fastcgi_script_name; 91 | # include fastcgi_params; 92 | #} 93 | 94 | # deny access to .htaccess files, if Apache's document root 95 | # concurs with nginx's one 96 | # 97 | #location ~ /\.ht { 98 | # deny all; 99 | #} 100 | } 101 | 102 | 103 | # another virtual host using mix of IP-, name-, and port-based configuration 104 | # 105 | #server { 106 | # listen 8000; 107 | # listen somename:8080; 108 | # server_name somename alias another.alias; 109 | # root html; 110 | 111 | # location / { 112 | # } 113 | #} 114 | 115 | 116 | # HTTPS server 117 | # 118 | #server { 119 | # listen 443; 120 | # server_name localhost; 121 | # root html; 122 | 123 | # ssl on; 124 | # ssl_certificate cert.pem; 125 | # ssl_certificate_key cert.key; 126 | 127 | # ssl_session_timeout 5m; 128 | 129 | # ssl_protocols SSLv2 SSLv3 TLSv1; 130 | # ssl_ciphers HIGH:!aNULL:!MD5; 131 | # ssl_prefer_server_ciphers on; 132 | 133 | # location / { 134 | # } 135 | #} 136 | 137 | } 138 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark Notebook 2 | 3 | [![Build Status][build-badge]][build-url] 4 | [![License][license-badge]][license-url] 5 | 6 | The Spark Notebook project is fast way of getting a [Spark](http://spark.apache.org/) cluster up and running on [AWS](http://aws.amazon.com) with the friendly [IPython](http://ipython.org) interface. 7 | 8 | ## Before you start 9 | You'll need 10 | 11 | 1. to have [Docker installed](https://docs.docker.com/installation/) (recommended) or [no docker setup](nodocker.md) 12 | 1. [AWS access keys](http://aws.amazon.com/developers/access-keys) 13 | 1. One [AWS keypair](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html#having-ec2-create-your-key-pair) 14 | 15 | 16 | ## Setup 17 | 1. git clone https://github.com/eleflow/sparknotebook.git 18 | 1. cd sparknotebook 19 | 1. 
create an aws.deploy.env file with these entries: 20 | 21 | ```sh 22 | AWS_ACCESS_KEY_ID= 23 | AWS_SECRET_ACCESS_KEY= 24 | AWS_KEY_PAIR= 25 | ``` 26 | 1. Run 27 | 28 | ``` $ docker build --rm -f=aws.deploy.Dockerfile -t=aws.deploy .``` 29 | 30 | ## Running the Notebook on AWS 31 | 32 | 1. Run `sudo docker run -it --env-file ./aws.deploy.env --volume $PWD:/sparknotebook --volume $HOME/.ssh:/.ssh aws.deploy` and, if all goes well, you will see the IP address of your sparknotebook server in a line like this: 33 | ```sh 34 | ... 35 | 36 | PLAY RECAP ******************************************************************** 37 | 52.10.183.42 : ok=21 changed=3 unreachable=0 failed=0 38 | ``` 39 | 1. 52.10.183.42 will be replaced by your own IP address. Open that IP address in your browser to access the notebook. 40 | 41 | ## Spark Notebook 42 | 43 | The Spark Notebook kernel is deployed on your server, and you can access it through port 80 with a web browser. 44 | The initial notebook state is shown in the picture below: 45 | 46 | ![Alt text](/../images/images/EmptyNotebook.png?raw=true "Initial state of a Spark Notebook") 47 | 48 | To start a new notebook, click the New Notebook button and you will be redirected to a new tab containing an empty notebook. 49 | The notebook is a container of multiple text areas, into which you can type any Scala code, including multi-line scripts. To execute code, focus the text area and hit Shift + ENTER, or click the play button in the notebook header. Each time you submit code to the notebook it is compiled and, if compilation succeeds, executed. 50 | 51 | ## Cluster Settings 52 | 53 | One of the cluster settings you are likely to change is the number of slaves. To change it to 30, you can run this code on the Spark Notebook: 54 | ```scala 55 | ClusterSettings.coreInstanceCount = 30 // Number of workers available in your cluster - defaults to 3 56 | ``` 57 | For other settings, see [ClusterSettings](src/main/scala/eleflow/sparknotebook/SparkNotebookContext.scala) 58 | ## SparkContext 59 | A SparkContext can be accessed with: 60 | ```scala 61 | sparkContext 62 | ``` 63 | This is a method of SparkNotebookContext; it provisions the machines and sets up the cluster the first time it runs. Example output of this method is shown below: 64 | 65 | ![Alt text](/../images/images/ClusterInstantiation.png?raw=true "Sample output of a cluster instantiation") 66 | 67 | ## Shutdown 68 | 69 | To shut down the cluster and terminate the master and slaves, run: 70 | ```scala 71 | terminate 72 | ``` 73 | 74 | ## Monitoring 75 | ### Ganglia 76 | 77 | The master instance of your cluster also has a monitoring tool named Ganglia installed, and its address is displayed when you create the SparkContext. 78 | Ganglia helps you monitor CPU, memory and disk usage, as well as JVM data such as GC executions, displaying graphs of these metrics. It is very useful for choosing the correct cluster size for your tasks. 79 | The Ganglia address is printed on the screen during cluster instantiation. It is always available at masterhost:5080/ganglia. 80 | Note that the information shown in Ganglia has a slight delay. 
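Putting the pieces above together, a typical notebook session might look like the sketch below. It uses only the `ClusterSettings` fields and the `sparkContext`/`terminate` calls described above (`ec2KeyName` comes from the ClusterSettings source); the instance count and the key-pair name are illustrative placeholders.

```scala
// Adjust the cluster before the first call to sparkContext, which provisions it
ClusterSettings.coreInstanceCount = 10           // illustrative value; the default is 3
ClusterSettings.ec2KeyName = Some("my-keypair")  // hypothetical AWS key pair name

// The first call provisions the EC2 cluster (and prints the Ganglia address)
val sc = sparkContext
sc.parallelize(1 to 1000).sum()

// Shut the master and slaves down when you are done
terminate
```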
81 | 82 | # Local build 83 | 84 | To build and run locally go [here](buildlocal.md) 85 | 86 | # License 87 | 88 | This project is distributed under Apache License Version 2.0 89 | 90 | [build-badge]: https://travis-ci.org/eleflow/sparknotebook.svg?branch=master 91 | [build-url]: https://travis-ci.org/eleflow/sparknotebook 92 | [license-badge]: https://img.shields.io/badge/License-Apache%202-blue.svg?style=flat 93 | [license-url]: LICENSE 94 | -------------------------------------------------------------------------------- /sbt/sbt/bin/sbt: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | ### ------------------------------- ### 5 | ### Helper methods for BASH scripts ### 6 | ### ------------------------------- ### 7 | 8 | realpath () { 9 | ( 10 | TARGET_FILE="$1" 11 | FIX_CYGPATH="$2" 12 | 13 | cd "$(dirname "$TARGET_FILE")" 14 | TARGET_FILE=$(basename "$TARGET_FILE") 15 | 16 | COUNT=0 17 | while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] 18 | do 19 | TARGET_FILE=$(readlink "$TARGET_FILE") 20 | cd "$(dirname "$TARGET_FILE")" 21 | TARGET_FILE=$(basename "$TARGET_FILE") 22 | COUNT=$(($COUNT + 1)) 23 | done 24 | 25 | # make sure we grab the actual windows path, instead of cygwin's path. 26 | if [[ "x$FIX_CYGPATH" != "x" ]]; then 27 | echo "$(cygwinpath "$(pwd -P)/$TARGET_FILE")" 28 | else 29 | echo "$(pwd -P)/$TARGET_FILE" 30 | fi 31 | ) 32 | } 33 | 34 | 35 | # Uses uname to detect if we're in the odd cygwin environment. 36 | is_cygwin() { 37 | local os=$(uname -s) 38 | case "$os" in 39 | CYGWIN*) return 0 ;; 40 | *) return 1 ;; 41 | esac 42 | } 43 | 44 | # TODO - Use nicer bash-isms here. 45 | CYGWIN_FLAG=$(if is_cygwin; then echo true; else echo false; fi) 46 | 47 | 48 | # This can fix cygwin style /cygdrive paths so we get the 49 | # windows style paths. 50 | cygwinpath() { 51 | local file="$1" 52 | if [[ "$CYGWIN_FLAG" == "true" ]]; then 53 | echo $(cygpath -w $file) 54 | else 55 | echo $file 56 | fi 57 | } 58 | 59 | . "$(dirname "$(realpath "$0")")/sbt-launch-lib.bash" 60 | 61 | 62 | declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" 63 | declare -r sbt_opts_file=".sbtopts" 64 | declare -r etc_sbt_opts_file="${sbt_home}/conf/sbtopts" 65 | declare -r win_sbt_opts_file="${sbt_home}/conf/sbtconfig.txt" 66 | 67 | usage() { 68 | cat < path to global settings/plugins directory (default: ~/.sbt) 77 | -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series) 78 | -ivy path to local Ivy repository (default: ~/.ivy2) 79 | -mem set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) 80 | -no-share use all local caches; no sharing 81 | -no-global uses global caches, but does not use global ~/.sbt directory. 82 | -jvm-debug Turn on JVM debugging, open at the given port. 
83 | -batch Disable interactive mode 84 | 85 | # sbt version (default: from project/build.properties if present, else latest release) 86 | -sbt-version use the specified version of sbt 87 | -sbt-jar use the specified jar as the sbt launcher 88 | -sbt-rc use an RC version of sbt 89 | -sbt-snapshot use a snapshot version of sbt 90 | 91 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) 92 | -java-home alternate JAVA_HOME 93 | 94 | # jvm options and output control 95 | JAVA_OPTS environment variable, if unset uses "$java_opts" 96 | SBT_OPTS environment variable, if unset uses "$default_sbt_opts" 97 | .sbtopts if this file exists in the current directory, it is 98 | prepended to the runner args 99 | /etc/sbt/sbtopts if this file exists, it is prepended to the runner args 100 | -Dkey=val pass -Dkey=val directly to the java runtime 101 | -J-X pass option -X directly to the java runtime 102 | (-J is stripped) 103 | -S-X add -X to sbt's scalacOptions (-S is stripped) 104 | 105 | In the case of duplicated or conflicting options, the order above 106 | shows precedence: JAVA_OPTS lowest, command line options highest. 107 | EOM 108 | } 109 | 110 | 111 | 112 | process_my_args () { 113 | while [[ $# -gt 0 ]]; do 114 | case "$1" in 115 | -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; 116 | -no-share) addJava "$noshare_opts" && shift ;; 117 | -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;; 118 | -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; 119 | -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;; 120 | -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; 121 | -batch) exec (t._1 + 1, t._2 )) 41 | val columnsSize = summarizedColumns.map(_._2._1).sum().toInt 42 | val targetIndex=train.columnIndexOf(target) 43 | val idIndex = test.columnIndexOf(id) 44 | (createLabeledPointFromRDD(train.sliceByName(excludes = Seq(id)), Seq(targetIndex-1), summarizedColumns, DataSetType.Train, columnsSize), 45 | createLabeledPointFromRDD(test, Seq(idIndex), summarizedColumns, DataSetType.Test, columnsSize)) 46 | } 47 | 48 | def createLabeledPointFromRDD(schemaRDD: Dataset, target: Seq[Int], datasetType: DataSetType.Types): RDD[(Map[Double,Any], LabeledPoint)] = { 49 | createLabeledPointFromRDD(schemaRDD, target, schemaRDD.summarizedColumns, datasetType, schemaRDD.columnsSize.toInt - 1) 50 | } 51 | 52 | def createLabeledPointFromRDD(rdd: Dataset, target: Seq[Int], 53 | normalized: RDD[(Int, (Int, (Any => Int), (Any => Double)))], 54 | dataSetType: DataSetType.Types, columnsSize: Int): RDD[(Map[Double,Any], LabeledPoint)] = { 55 | val (fields, idField) = rdd.schema.fields.zipWithIndex.partition(f => !target.contains(f._2)) 56 | val normalizedStrings = rdd.context.broadcast(normalized.collectAsMap()) 57 | rdd.zipWithIndex.map { 58 | case (row, rowIndex) => 59 | val norm = normalizedStrings.value 60 | val normValues = fields.map { 61 | case (fieldType, index) => 62 | norm.get(index).map { 63 | f => 64 | (f._1, f._2.apply(row(index)), f._3.apply(row(index))) 65 | }.getOrElse( 66 | throw new UnexpectedValueException(s"Unexpected String Value exception ${row(index)}")) 67 | } 68 | 69 | val (_, indexes, values) = normValues.tail.scanLeft((normValues.head))((b, a) => (b._1 + a._1, (b._1 + a._2), a._3)).filter(_._3 != 0).unzip3 70 | val rowIndexD = rowIndex.toDouble +1 71 | (idField.head._1.dataType) match { 72 | case (StringType) => { 73 | dataSetType match { 74 | case 
DataSetType.Test => (Map(rowIndexD -> row(target.head)), LabeledPoint(rowIndexD,Vectors.sparse(columnsSize, indexes.toArray, values.toArray))) 75 | case DataSetType.Train => (Map(rowIndexD -> row(target.head)), LabeledPoint(rowIndexD,Vectors.sparse(columnsSize, indexes.toArray, values.toArray))) 76 | } 77 | } 78 | case _ => { 79 | dataSetType match { 80 | case DataSetType.Train => (Map(rowIndexD -> row(target.head) ), LabeledPoint(toDouble(row(target.head)), Vectors.sparse(columnsSize, indexes.toArray, values.toArray))) 81 | case DataSetType.Test => (Map(rowIndexD -> row(target.head) ), LabeledPoint(rowIndexD, Vectors.sparse(columnsSize, indexes.toArray, values.toArray))) 82 | } 83 | } 84 | } 85 | } 86 | } 87 | 88 | def extractStringsFromTrainTestSchema(trainDataSet: Dataset, testDataSet: Dataset, target: Seq[Int]): Dataset = { 89 | val rdd = trainDataSet.slice(excludes = target) 90 | rdd.unionAll(testDataSet) 91 | } 92 | 93 | def toDouble(toConvert: Any): Double = { 94 | toConvert match { 95 | case v: Int => v.toDouble 96 | case v: Long => v.toDouble 97 | case v: BigDecimal => v.toDouble 98 | case v: Double => v 99 | case v: Timestamp => (v.getTime / 3600000).toDouble 100 | case v: String => v.toDouble 101 | case v: Byte => v.toDouble 102 | case v: Boolean => v match { 103 | case true => 1d 104 | case false => 0d 105 | } 106 | case _ => throw new Exception(toConvert.toString) 107 | } 108 | } 109 | 110 | def mapStringIdsToInt(rdd: SchemaRDD, columns: Seq[String]): Seq[Int] = rdd.schema.fields.zipWithIndex. 111 | filter(f => columns.contains(f._1.name)).map(_._2) 112 | 113 | 114 | def mapIdsToInt(rdd: SchemaRDD, columns: Seq[Union[IS]]): Seq[Int] = { 115 | columns.headOption.map { 116 | _.value[Int].map { 117 | _ => columns.map(_.value[Int].get) 118 | }.getOrElse { 119 | mapStringIdsToInt(rdd, columns.map(_.value[String].get)) 120 | } 121 | }.getOrElse(Seq.empty[Int]) 122 | } 123 | } 124 | 125 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/util/DateTimeParser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.util 17 | 18 | /** 19 | * Created by dirceu on 24/02/15. 
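 * Parses a date string either with a format previously registered via applyDateFormat (read back from the cluster's date-format file) or, when none is available, with a format inferred from DATE_FORMAT_REGEXPS.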
20 | */ 21 | import java.nio.charset.StandardCharsets 22 | import eleflow.sparknotebook.enums.PeriodOfDay 23 | import org.apache.spark.SparkFiles 24 | 25 | import scala.collection.JavaConversions._ 26 | import java.nio.file.{FileSystems, Files} 27 | import java.text.ParseException 28 | 29 | import org.joda.time.{ DateTime} 30 | import org.joda.time.format.DateTimeFormat 31 | 32 | import scala.util.{Success, Try} 33 | 34 | object DateTimeParser extends Serializable { 35 | 36 | def parse(dateString: String): Option[DateTime] = { 37 | val dateFormat: Option[String] = readDateFormat.orElse(determineDateFormat(dateString)) 38 | dateFormat.flatMap { f => 39 | Try { 40 | parse(dateString, dateFormat) 41 | } match { 42 | case Success(s) => s 43 | case _ => None 44 | } 45 | } 46 | } 47 | 48 | def parse(dateString: String, dateFormat: String): Option[DateTime] = { 49 | val formatter = DateTimeFormat.forPattern(dateFormat).withZoneUTC() 50 | return Some(formatter.parseDateTime(dateString)) 51 | } 52 | 53 | def parse(dateString: String, dateFormatOption: Option[String]): Option[DateTime] = { 54 | dateFormatOption match { 55 | case Some(dateFormat) => 56 | parse(dateString, dateFormat) 57 | case None => 58 | parse(dateString) 59 | } 60 | 61 | } 62 | 63 | def isValidDate(dateString: String): Boolean = parse(dateString).isDefined 64 | 65 | def isValidDate(dateString: String, dateFormat: String): Boolean = { 66 | try { 67 | parse(dateString, dateFormat) 68 | return true 69 | } 70 | catch { 71 | case e: ParseException => { 72 | return false 73 | } 74 | } 75 | } 76 | 77 | def determineDateFormat(dateString: String): Option[String] = DATE_FORMAT_REGEXPS.keySet.filter( 78 | regexp => dateString.toLowerCase.matches(regexp)).headOption.flatMap(DATE_FORMAT_REGEXPS.get(_)) 79 | 80 | private final val DATE_FORMAT_REGEXPS: Map[String, String] = Map( 81 | "^\\d{8}$" -> "yyyyMMdd", 82 | """^\d{1,2}-\d{1,2}-\d{4}$""" -> "dd-MM-yyyy", 83 | """^\d{4}-\d{1,2}-\d{1,2}$""" -> "yyyy-MM-dd", 84 | """^\d{1,2}/\d{1,2}/\d{4}$""" -> "MM/dd/yyyy", 85 | """^\d{4}/\d{1,2}/\d{1,2}$""" -> "yyyy/MM/dd", 86 | """^\d{1,2}\s[a-z]{3}\s\d{4}$""" -> "dd MMM yyyy", 87 | """^\d{1,2}\s[a-z]{4,}\s\d{4}$""" -> "dd MMMM yyyy", 88 | """^\d{12}$""" -> """yyyyMMddHHmm""", 89 | """^\d{8}\s\d{4}$""" -> """yyyyMMdd HHmm""", 90 | """^\d{1,2}-\d{1,2}-\d{4}\s\d{1,2}:\d{2}$""" -> "dd-MM-yyyy HH:mm", 91 | """^\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{2}$""" -> "yyyy-MM-dd HH:mm", 92 | """^\d{1,2}/\d{1,2}/\\d{4}\s\d{1,2}:\d{2}$""" -> "MM/dd/yyyy HH:mm", 93 | """^\d{4}/\d{1,2}/\\d{1,2}\s\d{1,2}:\d{2}$""" -> "yyyy/MM/dd HH:mm", 94 | """^\d{1,2}\s[a-z]{3}\s\d{4}\s\d{1,2}:\d{2}$""" -> "dd MMM yyyy HH:mm", 95 | """^\d{1,2}\s[a-z]{4,}\s\d{4}\s\d{1,2}:\d{2}$""" -> "dd MMMM yyyy HH:mm", 96 | """^\d{14}$""" -> """yyyyMMddHHmmss""", 97 | """^\d{8}\\s\d{6}$""" -> """yyyyMMdd HHmmss""", 98 | """^\d{1,2}-\d{1,2}-\d{4}\s\d{1,2}:\d{2}:\d{2}$""" -> "dd-MM-yyyy HH:mm:ss", 99 | """^\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{2}:\d{2}$""" -> "yyyy-MM-dd HH:mm:ss", 100 | """^\d{1,2}/\d{1,2}/\d{4}\s\d{1,2}:\d{2}:\d{2}$""" -> "MM/dd/yyyy HH:mm:ss", 101 | """^\d{4}/\d{1,2}/\d{1,2}\s\d{1,2}:\d{2}:\d{2}$""" -> "yyyy/MM/dd HH:mm:ss", 102 | """^\d{1,2}\s[a-z]{3}\s\d{4}\s\d{1,2}:\d{2}:\d{2}$""" -> "dd MMM yyyy HH:mm:ss", 103 | """^\d{1,2}\s[a-z]{4,}\s\d{4}\s\d{1,2}:\d{2}:\d{2}$""" -> "dd MMMM yyyy HH:mm:ss") 104 | 105 | def period(date: DateTime): PeriodOfDay.PeriodOfDay = { 106 | date.getHourOfDay() match { 107 | case hour if (hour < 6) => PeriodOfDay.Dawn 108 | case hour if (hour < 12) => 
PeriodOfDay.Morning 109 | case hour if (hour < 18) => PeriodOfDay.Afternoon 110 | case _ => PeriodOfDay.Evening 111 | } 112 | } 113 | 114 | lazy val dateFormatFilePath = FileSystems.getDefault().getPath(SparkNotebookConfig.tempFolder, SparkNotebookConfig.propertyFolder, 115 | SparkNotebookConfig.dateFormatFileName) 116 | 117 | private lazy val propertyFolderPath = FileSystems.getDefault.getPath(SparkNotebookConfig.tempFolder, SparkNotebookConfig.propertyFolder) 118 | 119 | def applyDateFormat(dateFormat: String) = { 120 | if (Files.notExists(propertyFolderPath)) { 121 | Files.createDirectory(propertyFolderPath) 122 | } 123 | Files.deleteIfExists(dateFormatFilePath) 124 | Files.write(dateFormatFilePath, dateFormat.getBytes) 125 | } 126 | 127 | private def readDateFormat = { 128 | val clusterFilePath = FileSystems.getDefault.getPath(SparkFiles.get(SparkNotebookConfig.dateFormatFileName)) 129 | if (Files.exists(clusterFilePath)) Files.readAllLines(clusterFilePath, StandardCharsets.UTF_8).headOption 130 | else None 131 | } 132 | 133 | } 134 | 135 | /* 136 | * Licensed to the Apache Software Foundation (ASF) under one or more 137 | * contributor license agreements. See the NOTICE file distributed with 138 | * this work for additional information regarding copyright ownership. 139 | * The ASF licenses this file to You under the Apache License, Version 2.0 140 | * (the "License"); you may not use this file except in compliance with 141 | * the License. You may obtain a copy of the License at 142 | * 143 | * http://www.apache.org/licenses/LICENSE-2.0 144 | * 145 | * Unless required by applicable law or agreed to in writing, software 146 | * distributed under the License is distributed on an "AS IS" BASIS, 147 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 148 | * See the License for the specific language governing permissions and 149 | * limitations under the License. 150 | */ 151 | final class DateTimeParser { 152 | 153 | } -------------------------------------------------------------------------------- /sbt/sbt/bin/sbt-launch-lib.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | 4 | # A library to simplify using the SBT launcher from other packages. 5 | # Note: This should be used by tools like giter8/conscript etc. 6 | 7 | # TODO - Should we merge the main SBT script with this library? 8 | 9 | declare -a residual_args 10 | declare -a java_args 11 | declare -a scalac_args 12 | declare -a sbt_commands 13 | declare java_cmd=java 14 | declare java_version 15 | declare -r sbt_bin_dir="$(dirname "$(realpath "$0")")" 16 | declare -r sbt_home="$(dirname "$sbt_bin_dir")" 17 | 18 | echoerr () { 19 | echo 1>&2 "$@" 20 | } 21 | vlog () { 22 | [[ $verbose || $debug ]] && echoerr "$@" 23 | } 24 | dlog () { 25 | [[ $debug ]] && echoerr "$@" 26 | } 27 | 28 | jar_file () { 29 | echo "$(cygwinpath "${sbt_home}/bin/sbt-launch.jar")" 30 | } 31 | 32 | acquire_sbt_jar () { 33 | sbt_jar="$(jar_file)" 34 | 35 | if [[ ! 
-f "$sbt_jar" ]]; then 36 | echoerr "Could not find launcher jar: $sbt_jar" 37 | exit 2 38 | fi 39 | } 40 | 41 | execRunner () { 42 | # print the arguments one to a line, quoting any containing spaces 43 | [[ $verbose || $debug ]] && echo "# Executing command line:" && { 44 | for arg; do 45 | if printf "%s\n" "$arg" | grep -q ' '; then 46 | printf "\"%s\"\n" "$arg" 47 | else 48 | printf "%s\n" "$arg" 49 | fi 50 | done 51 | echo "" 52 | } 53 | 54 | # THis used to be exec, but we loose the ability to re-hook stty then 55 | # for cygwin... Maybe we should flag the feature here... 56 | "$@" 57 | } 58 | 59 | addJava () { 60 | dlog "[addJava] arg = '$1'" 61 | java_args=( "${java_args[@]}" "$1" ) 62 | } 63 | addSbt () { 64 | dlog "[addSbt] arg = '$1'" 65 | sbt_commands=( "${sbt_commands[@]}" "$1" ) 66 | } 67 | addResidual () { 68 | dlog "[residual] arg = '$1'" 69 | residual_args=( "${residual_args[@]}" "$1" ) 70 | } 71 | addDebugger () { 72 | addJava "-agentlib:jdwp:transport=dt_socket,server=y,suspend=n,address=$1" 73 | } 74 | 75 | get_mem_opts () { 76 | # if we detect any of these settings in ${JAVA_OPTS} we need to NOT output our settings. 77 | # The reason is the Xms/Xmx, if they don't line up, cause errors. 78 | if [[ "${JAVA_OPTS}" == *-Xmx* ]] || [[ "${JAVA_OPTS}" == *-Xms* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxPermSize* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxMetaspaceSize* ]] || [[ "${JAVA_OPTS}" == *-XX:ReservedCodeCacheSize* ]]; then 79 | echo "" 80 | else 81 | # a ham-fisted attempt to move some memory settings in concert 82 | # so they need not be messed around with individually. 83 | local mem=${1:-1024} 84 | local codecache=$(( $mem / 8 )) 85 | (( $codecache > 128 )) || codecache=128 86 | (( $codecache < 512 )) || codecache=512 87 | local class_metadata_size=$(( $codecache * 2 )) 88 | local class_metadata_opt=$([[ "$java_version" < "1.8" ]] && echo "MaxPermSize" || echo "MaxMetaspaceSize") 89 | 90 | echo "-Xms${mem}m -Xmx${mem}m -XX:ReservedCodeCacheSize=${codecache}m -XX:${class_metadata_opt}=${class_metadata_size}m" 91 | fi 92 | } 93 | 94 | require_arg () { 95 | local type="$1" 96 | local opt="$2" 97 | local arg="$3" 98 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then 99 | echo "$opt requires <$type> argument" 100 | exit 1 101 | fi 102 | } 103 | 104 | is_function_defined() { 105 | declare -f "$1" > /dev/null 106 | } 107 | 108 | process_args () { 109 | while [[ $# -gt 0 ]]; do 110 | case "$1" in 111 | -h|-help) usage; exit 1 ;; 112 | -v|-verbose) verbose=1 && shift ;; 113 | -d|-debug) debug=1 && shift ;; 114 | 115 | -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;; 116 | -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;; 117 | -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;; 118 | -batch) exec &1 | awk -F '"' '/version/ {print $2}') 137 | vlog "[process_args] java_version = '$java_version'" 138 | } 139 | 140 | # Detect that we have java installed. 141 | checkJava() { 142 | local required_version="$1" 143 | # Now check to see if it's a good enough version 144 | if [[ "$java_version" == "" ]]; then 145 | echo 146 | echo No java installations was detected. 147 | echo Please go to http://www.java.com/getjava/ and download 148 | echo 149 | exit 1 150 | elif [[ ! 
"$java_version" > "$required_version" ]]; then 151 | echo 152 | echo The java installation you have is not up to date 153 | echo $script_name requires at least version $required_version+, you have 154 | echo version $java_version 155 | echo 156 | echo Please go to http://www.java.com/getjava/ and download 157 | echo a valid Java Runtime and install before running $script_name. 158 | echo 159 | exit 1 160 | fi 161 | } 162 | 163 | 164 | run() { 165 | # no jar? download it. 166 | [[ -f "$sbt_jar" ]] || acquire_sbt_jar "$sbt_version" || { 167 | # still no jar? uh-oh. 168 | echo "Download failed. Obtain the sbt-launch.jar manually and place it at $sbt_jar" 169 | exit 1 170 | } 171 | 172 | # process the combined args, then reset "$@" to the residuals 173 | process_args "$@" 174 | set -- "${residual_args[@]}" 175 | argumentCount=$# 176 | 177 | # TODO - java check should be configurable... 178 | checkJava "1.6" 179 | 180 | #If we're in cygwin, we should use the windows config, and terminal hacks 181 | if [[ "$CYGWIN_FLAG" == "true" ]]; then 182 | stty -icanon min 1 -echo > /dev/null 2>&1 183 | addJava "-Djline.terminal=jline.UnixTerminal" 184 | addJava "-Dsbt.cygwin=true" 185 | fi 186 | 187 | # run sbt 188 | execRunner "$java_cmd" \ 189 | ${SBT_OPTS:-$default_sbt_opts} \ 190 | $(get_mem_opts $sbt_mem) \ 191 | ${JAVA_OPTS} \ 192 | ${java_args[@]} \ 193 | -jar "$sbt_jar" \ 194 | "${sbt_commands[@]}" \ 195 | "${residual_args[@]}" 196 | 197 | exit_code=$? 198 | 199 | # Clean up the terminal from cygwin hacks. 200 | if [[ "$CYGWIN_FLAG" == "true" ]]; then 201 | stty icanon echo > /dev/null 2>&1 202 | fi 203 | exit $exit_code 204 | } 205 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /src/templates/bash-template: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Copyright 2015 eleflow.com.br. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | ### ------------------------------- ### 20 | ### Helper methods for BASH scripts ### 21 | ### ------------------------------- ### 22 | 23 | die() { 24 | echo "$@" 1>&2 25 | exit 1 26 | } 27 | 28 | realpath () { 29 | ( 30 | TARGET_FILE="$1" 31 | CHECK_CYGWIN="$2" 32 | 33 | cd "$(dirname "$TARGET_FILE")" 34 | TARGET_FILE=$(basename "$TARGET_FILE") 35 | 36 | COUNT=0 37 | while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] 38 | do 39 | TARGET_FILE=$(readlink "$TARGET_FILE") 40 | cd "$(dirname "$TARGET_FILE")" 41 | TARGET_FILE=$(basename "$TARGET_FILE") 42 | COUNT=$(($COUNT + 1)) 43 | done 44 | 45 | if [ "$TARGET_FILE" == "." -o "$TARGET_FILE" == ".." ]; then 46 | cd "$TARGET_FILE" 47 | TARGET_FILEPATH= 48 | else 49 | TARGET_FILEPATH=/$TARGET_FILE 50 | fi 51 | 52 | # make sure we grab the actual windows path, instead of cygwin's path. 53 | if [[ "x$CHECK_CYGWIN" == "x" ]]; then 54 | echo "$(pwd -P)/$TARGET_FILE" 55 | else 56 | echo $(cygwinpath "$(pwd -P)/$TARGET_FILE") 57 | fi 58 | ) 59 | } 60 | 61 | # TODO - Do we need to detect msys? 62 | 63 | # Uses uname to detect if we're in the odd cygwin environment. 64 | is_cygwin() { 65 | local os=$(uname -s) 66 | case "$os" in 67 | CYGWIN*) return 0 ;; 68 | *) return 1 ;; 69 | esac 70 | } 71 | 72 | # This can fix cygwin style /cygdrive paths so we get the 73 | # windows style paths. 74 | cygwinpath() { 75 | local file="$1" 76 | if is_cygwin; then 77 | echo $(cygpath -w $file) 78 | else 79 | echo $file 80 | fi 81 | } 82 | 83 | # Make something URI friendly 84 | make_url() { 85 | url="$1" 86 | local nospaces=${url// /%20} 87 | if is_cygwin; then 88 | echo "/${nospaces//\\//}" 89 | else 90 | echo "$nospaces" 91 | fi 92 | } 93 | 94 | # This crazy function reads in a vanilla "linux" classpath string (only : are separators, and all /), 95 | # and returns a classpath with windows style paths, and ; separators. 96 | fixCygwinClasspath() { 97 | OLDIFS=$IFS 98 | IFS=":" 99 | read -a classpath_members <<< "$1" 100 | declare -a fixed_members 101 | IFS=$OLDIFS 102 | for i in "${!classpath_members[@]}" 103 | do 104 | fixed_members[i]=$(realpath "${classpath_members[i]}" "fix") 105 | done 106 | IFS=";" 107 | echo "${fixed_members[*]}" 108 | IFS=$OLDIFS 109 | } 110 | 111 | # Fix the classpath we use for cygwin. 112 | fix_classpath() { 113 | cp="$1" 114 | if is_cygwin; then 115 | echo "$(fixCygwinClasspath "$cp")" 116 | else 117 | echo "$cp" 118 | fi 119 | } 120 | # Detect if we should use JAVA_HOME or just try PATH. 
121 | get_java_cmd() { 122 | if [[ -n "$JAVA_HOME" ]] && [[ -x "$JAVA_HOME/bin/java" ]]; then 123 | echo "$JAVA_HOME/bin/java" 124 | else 125 | echo "java" 126 | fi 127 | } 128 | 129 | echoerr () { 130 | echo 1>&2 "$@" 131 | } 132 | vlog () { 133 | [[ $verbose || $debug ]] && echoerr "$@" 134 | } 135 | dlog () { 136 | [[ $debug ]] && echoerr "$@" 137 | } 138 | execRunner () { 139 | # print the arguments one to a line, quoting any containing spaces 140 | [[ $verbose || $debug ]] && echo "# Executing command line:" && { 141 | for arg; do 142 | if printf "%s\n" "$arg" | grep -q ' '; then 143 | printf "\"%s\"\n" "$arg" 144 | else 145 | printf "%s\n" "$arg" 146 | fi 147 | done 148 | echo "" 149 | } 150 | 151 | # we use "exec" here for our pids to be accurate. 152 | exec "$@" 153 | } 154 | addJava () { 155 | dlog "[addJava] arg = '$1'" 156 | java_args+=( "$1" ) 157 | } 158 | addApp () { 159 | dlog "[addApp] arg = '$1'" 160 | app_commands+=( "$1" ) 161 | } 162 | addResidual () { 163 | dlog "[residual] arg = '$1'" 164 | residual_args+=( "$1" ) 165 | } 166 | addDebugger () { 167 | addJava "-Xdebug" 168 | addJava "-Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1" 169 | } 170 | addMemory () { 171 | addJava "-Xmx" 172 | addJava "28g" 173 | } 174 | 175 | # a ham-fisted attempt to move some memory settings in concert 176 | # so they need not be messed around with individually. 177 | get_mem_opts () { 178 | local mem=${1:-2560} 179 | local perm=$(( $mem / 4 )) 180 | (( $perm > 256 )) || perm=256 181 | (( $perm < 1024 )) || perm=1024 182 | local codecache=$(( $perm / 2 )) 183 | 184 | # if we detect any of these settings in ${java_opts} we need to NOT output our settings. 185 | # The reason is the Xms/Xmx, if they don't line up, cause errors. 186 | if [[ "${java_opts}" == *-Xmx* ]] || 187 | [[ "${java_opts}" == *-Xms* ]] || 188 | [[ "${java_opts}" == *-XX:MaxPermSize* ]] || 189 | [[ "${java_opts}" == *-XX:ReservedCodeCacheSize* ]] || 190 | # check java arguments for settings, too 191 | [[ "${java_args[@]}" == *-Xmx* ]] || 192 | [[ "${java_args[@]}" == *-Xms* ]] || 193 | [[ "${java_args[@]}" == *-XX:MaxPermSize* ]] || 194 | [[ "${java_args[@]}" == *-XX:ReservedCodeCacheSize* ]]; 195 | then 196 | echo "" 197 | elif [[ !$no_version_check ]] && [[ "$java_version" > "1.8" ]]; then 198 | echo "-Xms${mem}m -Xmx${mem}m -XX:ReservedCodeCacheSize=${codecache}m" 199 | else 200 | echo "-Xms${mem}m -Xmx${mem}m -XX:MaxPermSize=${perm}m -XX:ReservedCodeCacheSize=${codecache}m" 201 | fi 202 | } 203 | require_arg () { 204 | local type="$1" 205 | local opt="$2" 206 | local arg="$3" 207 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then 208 | die "$opt requires <$type> argument" 209 | fi 210 | } 211 | is_function_defined() { 212 | declare -f "$1" > /dev/null 213 | } 214 | 215 | # Attempt to detect if the script is running via a GUI or not 216 | # TODO - Determine where/how we use this generically 217 | detect_terminal_for_ui() { 218 | [[ ! -t 0 ]] && [[ "${#residual_args}" == "0" ]] && { 219 | echo "true" 220 | } 221 | # SPECIAL TEST FOR MAC 222 | [[ "$(uname)" == "Darwin" ]] && [[ "$HOME" == "$PWD" ]] && [[ "${#residual_args}" == "0" ]] && { 223 | echo "true" 224 | } 225 | } 226 | 227 | # Processes incoming arguments and places them in appropriate global variables. called by the run method. 
228 | process_args () { 229 | local no_more_snp_opts=0 230 | while [[ $# -gt 0 ]]; do 231 | case "$1" in 232 | --) shift && no_more_snp_opts=1 && break ;; 233 | -h|-help) usage; exit 1 ;; 234 | -v|-verbose) verbose=1 && shift ;; 235 | -d|-debug) debug=1 && shift ;; 236 | 237 | -no-version-check) no_version_check=1 && shift ;; 238 | 239 | -mem) require_arg integer "$1" "$2" && app_mem="$2" && shift 2 ;; 240 | -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;; 241 | 242 | -main) custom_mainclass="$2" && shift 2 ;; 243 | 244 | -java-home) require_arg path "$1" "$2" && java_cmd="$2/bin/java" && shift 2 ;; 245 | 246 | -D*) addJava "$1" && shift ;; 247 | -J*) addJava "${1:2}" && shift ;; 248 | *) addResidual "$1" && shift ;; 249 | esac 250 | done 251 | 252 | if [[ no_more_snp_opts ]]; then 253 | while [[ $# -gt 0 ]]; do 254 | addResidual "$1" && shift 255 | done 256 | fi 257 | 258 | is_function_defined process_my_args && { 259 | myargs=("${residual_args[@]}") 260 | residual_args=() 261 | process_my_args "${myargs[@]}" 262 | } 263 | } 264 | 265 | # Actually runs the script. 266 | run() { 267 | # TODO - check for sane environment 268 | 269 | # process the combined args, then reset "$@" to the residuals 270 | process_args "$@" 271 | set -- "${residual_args[@]}" 272 | argumentCount=$# 273 | 274 | #check for jline terminal fixes on cygwin 275 | if is_cygwin; then 276 | stty -icanon min 1 -echo > /dev/null 2>&1 277 | addJava "-Djline.terminal=jline.UnixTerminal" 278 | addJava "-Dsbt.cygwin=true" 279 | fi 280 | 281 | # check java version 282 | if [[ ! $no_version_check ]]; then 283 | java_version_check 284 | fi 285 | 286 | if [ -n "$custom_mainclass" ]; then 287 | mainclass="$custom_mainclass" 288 | else 289 | mainclass="$app_mainclass" 290 | fi 291 | 292 | # Now we check to see if there are any java opts on the environment. These get listed first, with the script able to override them. 293 | if [[ "$JAVA_OPTS" != "" ]]; then 294 | java_opts="${JAVA_OPTS}" 295 | fi 296 | 297 | # run sbt 298 | execRunner "$java_cmd" \ 299 | $(get_mem_opts $app_mem) \ 300 | ${java_opts[@]} \ 301 | "${java_args[@]}" \ 302 | -cp "$(fix_classpath "${app_home}/../ec2:$lib_dir/*:/opt/spark/lib/*:$lib_dir/../*")" \ 303 | $mainclass \ 304 | "${app_commands[@]}" \ 305 | "${residual_args[@]}" 306 | 307 | local exit_code=$? 308 | if is_cygwin; then 309 | stty icanon echo > /dev/null 2>&1 310 | fi 311 | exit $exit_code 312 | } 313 | 314 | # Loads a configuration file full of default command line options for this script. 315 | loadConfigFile() { 316 | cat "$1" | sed '/^\#/d' 317 | } 318 | 319 | # Now check to see if it's a good enough version 320 | # TODO - Check to see if we have a configured default java version, otherwise use 1.6 321 | java_version_check() { 322 | readonly java_version=$("$java_cmd" -version 2>&1 | awk -F '"' '/version/ {print $2}') 323 | if [[ "$java_version" == "" ]]; then 324 | echo 325 | echo No java installations was detected. 326 | echo Please go to http://www.java.com/getjava/ and download 327 | echo 328 | exit 1 329 | elif [[ ! "$java_version" > "1.6" ]]; then 330 | echo 331 | echo The java installation you have is not up to date 332 | echo $app_name requires at least version 1.6+, you have 333 | echo version $java_version 334 | echo 335 | echo Please go to http://www.java.com/getjava/ and download 336 | echo a valid Java Runtime and install before running $app_name. 
337 | echo 338 | exit 1 339 | fi 340 | } 341 | 342 | ### ------------------------------- ### 343 | ### Start of customized settings ### 344 | ### ------------------------------- ### 345 | usage() { 346 | cat < set memory options in MB (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) 354 | -main Define a custom main class 355 | -jvm-debug Turn on JVM debugging, open at the given port. 356 | 357 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) 358 | -java-home alternate JAVA_HOME 359 | 360 | # jvm options and output control 361 | JAVA_OPTS environment variable, if unset uses "$java_opts" 362 | -Dkey=val pass -Dkey=val directly to the java runtime 363 | -J-X pass option -X directly to the java runtime 364 | (-J is stripped) 365 | 366 | # special option 367 | -- To stop parsing built-in commands from the rest of the command-line. 368 | e.g.) enabling debug and sending -d as app argument 369 | \$ ./start-script -d -- -d 370 | 371 | In the case of duplicated or conflicting options, basically the order above 372 | shows precedence: JAVA_OPTS lowest, command line options highest except "--". 373 | EOM 374 | } 375 | 376 | ### ------------------------------- ### 377 | ### Main script ### 378 | ### ------------------------------- ### 379 | 380 | declare -a residual_args 381 | declare -a java_args 382 | declare -a app_commands 383 | declare -r real_script_path="$(realpath "$0")" 384 | declare -r app_home="$(realpath "$(dirname "$real_script_path")")" 385 | # TODO - Check whether this is ok in cygwin... 386 | declare -r lib_dir="$(realpath "${app_home}/../lib")" 387 | ${{template_declares}} 388 | # java_cmd is overrode in process_args when -java-home is used 389 | declare java_cmd=$(get_java_cmd) 390 | 391 | # if configuration files exist, prepend their contents to $@ so it can be processed by this runner 392 | [[ -f "$script_conf_file" ]] && set -- $(loadConfigFile "$script_conf_file") "$@" 393 | 394 | run "$@" 395 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/SparkNotebookContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package eleflow.sparknotebook 17 | 18 | import java.io.{FileNotFoundException, InputStream, OutputStream} 19 | import java.net.URI 20 | import com.amazonaws.services.s3.model.{GetObjectRequest, ObjectMetadata, PutObjectRequest, S3Object} 21 | import com.amazonaws.services.s3.{AmazonS3, AmazonS3Client} 22 | import eleflow.sparknotebook.data.Dataset 23 | 24 | import org.apache.commons.io.IOUtils 25 | import org.apache.hadoop.conf.Configuration 26 | import org.apache.hadoop.fs.{FileStatus, FileUtil, FileSystem, Path} 27 | import org.apache.spark.sql.hive.HiveContext 28 | import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 29 | import org.apache.spark.{Logging, SparkConf, SparkContext} 30 | 31 | import scala.annotation.tailrec 32 | import scala.sys.process._ 33 | import scala.util.Try 34 | import scala.util.matching.Regex 35 | 36 | object ClusterSettings { 37 | 38 | var kryoBufferMaxSize: Option[String] = None 39 | var maxResultSize = "2g" 40 | var masterInstanceType = "r3.large" 41 | var coreInstanceType = "r3.large" 42 | var coreInstanceCount = 3 43 | var spotPriceFactor: Option[String] = Some("1.3") 44 | var ec2KeyName: Option[String] = None 45 | var hadoopVersion = "2" 46 | var clusterName = "SparkNotebookCluster" 47 | var region: Option[String] = None 48 | var profile: Option[String] = None 49 | var resume = false 50 | var executorMemory: Option[String] = None 51 | var defaultParallelism: Option[Int] = None 52 | var master: Option[String] = None 53 | 54 | def slavesCores = ClusterSettings.coreInstanceType match { 55 | case s: String if s.endsWith("xlarge") => 4 56 | case s: String if s.endsWith("2xlarge") => 8 57 | case s: String if s.endsWith("4xlarge") => 16 58 | case s: String if s.endsWith("8xlarge") => 32 59 | case _ => 2 60 | } 61 | 62 | def getNumberOfCores = ClusterSettings.coreInstanceCount * slavesCores 63 | } 64 | 65 | 66 | /** 67 | * User: paulomagalhaes 68 | * Date: 8/15/14 12:24 PM 69 | */ 70 | 71 | class SparkNotebookContext(@transient sparkConf: SparkConf) extends Serializable with Logging { 72 | val version = SparkNotebookVersion.version 73 | 74 | protected def this(sparkConf: SparkConf, data: String) = this(sparkConf) 75 | 76 | @transient protected lazy val s3Client: AmazonS3 = new AmazonS3Client() 77 | @transient protected var sc: Option[SparkContext] = None 78 | @transient var _sqlContext: Option[HiveContext] = None 79 | private var _masterHost: Option[String] = None 80 | protected val basePath: String = "/" 81 | 82 | def sparkContext(): SparkContext = sc getOrElse { 83 | val context = if (ClusterSettings.master.isDefined) createSparkContextForProvisionedCluster(sparkConf) 84 | else createSparkContextForNewCluster(sparkConf) 85 | addClasspathToSparkContext(context) 86 | sc = Some(context) 87 | context 88 | } 89 | 90 | def addClasspathToSparkContext(context: SparkContext) { 91 | val jodaJar = "joda-time.joda-time-.*jar".r 92 | val sparkNotebookContextJar = "eleflow.sparknotebook-.*jar".r 93 | val guavaJar = "com.google.guava.*".r 94 | val mySqlDriver = "mysql-connector-java.*".r 95 | val urls = this.getClass().getClassLoader().asInstanceOf[java.net.URLClassLoader].getURLs 96 | val jarUrls = urls.filter(url => 97 | jodaJar.findFirstIn(url.getFile) != None 98 | || sparkNotebookContextJar.findFirstIn(url.getFile) != None 99 | || guavaJar.findFirstIn(url.getFile) != None 100 | || mySqlDriver.findFirstIn(url.getFile) != None) 101 | jarUrls.foreach { url => 102 | logInfo(s"adding ${url.getPath} to spark context jars") 103 | context.addJar(url.getPath) 
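// addJar ships each of these jars to the executors so code submitted from the notebook can resolve them at runtime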
104 | } 105 | } 106 | 107 | def masterHost(): String = { 108 | return _masterHost match { 109 | case Some(host) => host 110 | case None => { 111 | initHostNames 112 | _masterHost.get 113 | } 114 | } 115 | } 116 | 117 | def initHostNames { 118 | _masterHost = createCluster(); 119 | } 120 | 121 | def masterHost_=(host: String): Unit = _masterHost = Some(host) 122 | 123 | def sqlContext(): HiveContext = { 124 | _sqlContext match { 125 | case None => { 126 | _sqlContext = Some(new HiveContext(sparkContext)); 127 | HiveThriftServer2.startWithContext(_sqlContext.get) 128 | _sqlContext.get 129 | } 130 | case Some(ctx) => ctx 131 | } 132 | } 133 | 134 | def createSparkContextForNewCluster(conf: SparkConf): SparkContext = { 135 | log.info(s"connecting to $masterHost") 136 | conf.setMaster(s"spark://$masterHost:7077") 137 | confSetup(conf) 138 | } 139 | 140 | private def confSetup(conf: SparkConf): SparkContext = { 141 | ClusterSettings.defaultParallelism.map(value => conf.set("spark.default.parallelism", value.toString)) 142 | ClusterSettings.kryoBufferMaxSize.map(value => conf.set("spark.kryoserializer.buffer.max.mb", value.toString)) 143 | //according to keo, in Making Sense of Spark Performance webcast, this codec is better than default 144 | conf.set("spark.io.compression.codec", "lzf") 145 | conf.set("spark.driver.maxResultSize", ClusterSettings.maxResultSize) 146 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 147 | ClusterSettings.executorMemory.foreach(conf.set("spark.executor.memory", _)) 148 | 149 | val defaultConfStream = this.getClass.getClassLoader.getResourceAsStream("spark-defaults.conf") 150 | if (defaultConfStream != null) { 151 | import scala.collection.JavaConversions._ 152 | val defaultConf = IOUtils.readLines(defaultConfStream) 153 | defaultConf.map { line => 154 | val keyValue = line.split("\\s+") 155 | if (keyValue.size == 2) 156 | conf.set(keyValue(0), keyValue(1)) 157 | } 158 | } 159 | //according to keo, in Making Sense of Spark Performance webcast, this codec is better than default 160 | conf.set("spark.io.compression.codec", "lzf") 161 | 162 | ClusterSettings.defaultParallelism.map(value => conf.set("spark.default.parallelism", value.toString)) 163 | ClusterSettings.kryoBufferMaxSize.map(value => conf.set("spark.kryoserializer.buffer.max.mb", value.toString)) 164 | 165 | conf.set("spark.driver.maxResultSize", ClusterSettings.maxResultSize) 166 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 167 | ClusterSettings.executorMemory.foreach(conf.set("spark.executor.memory", _)) 168 | println("sparkcontext") 169 | new SparkContext(conf) 170 | } 171 | 172 | def createSparkContextForProvisionedCluster(conf: SparkConf): SparkContext = { 173 | log.info("connecting to localhost") 174 | conf.setMaster(ClusterSettings.master.get) 175 | confSetup(conf) 176 | } 177 | 178 | def shellRun(command: Seq[String]) = { 179 | val out = new StringBuilder 180 | 181 | val logger = ProcessLogger( 182 | (o: String) => { 183 | out.append(o); 184 | logInfo(o) 185 | }, 186 | (e: String) => { 187 | println(e); 188 | logInfo(e) 189 | }) 190 | command ! 
logger 191 | out.toString() 192 | } 193 | 194 | def createCluster(): Option[String] = { 195 | 196 | val path = getClass.getResource(s"${basePath}spark_ec2.py").getPath 197 | import ClusterSettings._ 198 | val mandatory = Seq(path, 199 | "--hadoop-major-version", hadoopVersion, 200 | "--master-instance-type", masterInstanceType, 201 | "--slaves", coreInstanceCount.toString, 202 | "--instance-type", coreInstanceType) 203 | val command = mandatory ++ (ec2KeyName match { 204 | case None => Seq[String]() 205 | case Some(ec2KeyName) => Seq("--key-pair", ec2KeyName) 206 | }) ++ (spotPriceFactor match { 207 | case None => Seq[String]() 208 | case Some(spotPrice) => Seq("--spot-price", spotPrice.toString) 209 | }) ++ (region match { 210 | case None => Seq[String]() 211 | case Some(region) => Seq("--region", region.toString) 212 | }) ++ (profile match { 213 | case None => Seq[String]() 214 | case Some(profile) => Seq("--profile", profile.toString) 215 | }) ++ (if (resume) Seq("--resume") else Seq()) 216 | 217 | val output = shellRun((command ++ Seq("launch", clusterName))) 218 | 219 | val pattern = new Regex("Spark standalone cluster started at http://([^:]+):8080") 220 | val host = pattern.findAllIn(output).matchData.map(_.group(1)).next 221 | return Some(host) 222 | } 223 | 224 | def terminate() { 225 | clearContext 226 | val path = getClass.getResource(s"${basePath}spark_ec2.py").getPath 227 | import ClusterSettings._ 228 | 229 | val output = shellRun(Seq(path, "destroy", clusterName)) 230 | _masterHost = None 231 | ClusterSettings.resume = false 232 | } 233 | 234 | def clusterInfo() { 235 | val path = getClass.getResource(s"${basePath}spark_ec2.py").getPath 236 | import ClusterSettings._ 237 | val output = shellRun(Seq(path, "get-master", clusterName)) 238 | } 239 | 240 | def clearContext { 241 | ClusterSettings.resume = true 242 | sc.map { 243 | f => 244 | f.cancelAllJobs() 245 | f.stop() 246 | } 247 | _sqlContext = None 248 | sc = None 249 | } 250 | 251 | def reconnect(): Unit = { 252 | sc.map(_.stop()) 253 | sc = None 254 | _sqlContext = None 255 | } 256 | 257 | def getAllFilesRecursively(path: Path): Seq[String] = { 258 | val fs = path.getFileSystem(new Configuration) 259 | @tailrec 260 | def iter(fs: FileSystem, paths: Seq[Path], result: Seq[String]): Seq[String] = paths match { 261 | case path :: tail => 262 | val children: Seq[FileStatus] = try { 263 | fs.listStatus(path) 264 | } catch { 265 | case e: FileNotFoundException => 266 | // listStatus throws FNFE if the dir is empty 267 | Seq.empty[FileStatus] 268 | } 269 | val (files, directories) = children.partition(_.isFile) 270 | iter(fs, tail ++ directories.map(_.getPath), files.map(_.getPath.toString) ++ result) 271 | case _ => 272 | result 273 | } 274 | iter(fs, Seq(path), Seq()) 275 | } 276 | 277 | def copyDir(input: String, output: String): Unit = { 278 | val from = createPathInstance(input) 279 | 280 | val files = getAllFilesRecursively(from) 281 | val to = output.replaceAll(new URI(input).getPath, "") 282 | copyDir(files, to) 283 | } 284 | 285 | def copyDir(inputFiles: Seq[String], output: String): Unit = { 286 | sparkContext.parallelize(inputFiles).foreach { inputFile => 287 | val from = new URI(inputFile) 288 | 289 | copy(inputFile, s"$output/${from.getPath}") 290 | } 291 | } 292 | 293 | def copy(input: String, output: String): Unit = { 294 | val from = new URI(input) 295 | val to = new URI(output) 296 | val fromScheme = from.getScheme 297 | val toScheme = to.getScheme 298 | val conf = new Configuration() 299 | 300 | (fromScheme, 
toScheme) match { 301 | case ("s3n" | "s3", "s3n" | "s3") => ??? 302 | case (fromAddr, _) if (fromAddr.startsWith("s3")) => { 303 | val outputPath = createPathInstance(output) 304 | val fs = createPathInstance(output).getFileSystem(conf) 305 | copyFromS3(from, outputPath, fs) 306 | } 307 | case _ => { 308 | val srcPath = createPathInstance(input) 309 | val srcFs = srcPath.getFileSystem(conf) 310 | val dstPath = createPathInstance(output) 311 | val dstFs = dstPath.getFileSystem(conf) 312 | FileUtil.copy(srcFs, srcPath, dstFs, dstPath, false, conf) 313 | } 314 | } 315 | } 316 | 317 | def fs(pathStr: String): FileSystem = { 318 | val path = createPathInstance(pathStr) 319 | path.getFileSystem(new Configuration) 320 | } 321 | 322 | def sql(sql: String) = { 323 | sqlContext().sql(sql) 324 | } 325 | 326 | protected def copyFromS3(input: URI, path: Path, fs: FileSystem): Unit = { 327 | val rangeObjectRequest: GetObjectRequest = new GetObjectRequest(input.getHost, input.getPath.substring(1)) 328 | val inputStream: Try[InputStream] = Try { 329 | 330 | val objectPortion: S3Object = s3Client.getObject(rangeObjectRequest) 331 | objectPortion.getObjectContent() 332 | } 333 | inputStream.map { 334 | in => 335 | val copyResult = Try(fs.create(path)).flatMap { 336 | out => 337 | val copyResult = copyStreams(in, out) 338 | out.close 339 | copyResult 340 | } 341 | in.close 342 | copyResult 343 | }.recover { 344 | case e: Exception => throw e 345 | } 346 | } 347 | 348 | protected def createPathInstance(input: String) = new Path(input) 349 | 350 | protected def copyStreams(in: InputStream, out: OutputStream) = Try(IOUtils.copy(in, out)) 351 | 352 | protected def copyToS3(input: Path, bucket: String, fileName: String): Unit = { 353 | 354 | val objRequest = new PutObjectRequest(bucket, fileName, readFromHDFS(input), new ObjectMetadata()) 355 | s3Client.putObject(objRequest) 356 | } 357 | 358 | private def readFromHDFS(input: Path) = { 359 | val fs = input.getFileSystem(new Configuration) 360 | fs.open(input) 361 | } 362 | 363 | def load(file: String, separator: String = ",") = { 364 | Dataset(this, file, separator) 365 | } 366 | 367 | } 368 | 369 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/data/Dataset.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package eleflow.sparknotebook.data 17 | 18 | import java.net.URI 19 | import java.sql.Timestamp 20 | import eleflow.sparknotebook.enums.DataSetType 21 | import eleflow.sparknotebook.{SparkNotebookContext, ClusterSettings} 22 | import eleflow.sparknotebook.exception.{InvalidDataException, UnexpectedFileFormatException} 23 | import eleflow.sparknotebook.util.DateTimeParser 24 | import eleflow.sparknotebook.enums.DateSplitType._ 25 | import eleflow.sparknotebook.enums.DataSetType 26 | import eleflow.sparknotebook.exception.InvalidDataException 27 | import eleflow.sparknotebook.SparkNotebookContext 28 | import eleflow.sparknotebook.util.DateTimeParser 29 | import org.apache.spark.rdd.RDD 30 | import org.apache.spark.sql._ 31 | import org.apache.spark.sql.catalyst.types.{DataType, StructField} 32 | import org.apache.spark.storage.StorageLevel 33 | import org.joda.time.{DateTime, DateTimeZone, Days} 34 | 35 | import scala.collection.immutable.TreeSet 36 | 37 | 38 | /** 39 | * SparkNotebook 40 | * Copyright (C) 2014 eleflow. 41 | * User: paulomagalhaes 42 | * Date: 11/4/14 3:44 PM 43 | */ 44 | 45 | object Dataset { 46 | implicit def DatasetToSchemaRdd(dataset: Dataset): SchemaRDD = dataset.toSchemaRDD() 47 | 48 | implicit def SchemaRddToDataset(schemaRdd: SchemaRDD): Dataset = new Dataset(schemaRdd) 49 | 50 | implicit def FileDatasetToDataset(fileDS: FileDataset): Dataset = new Dataset(fileDS.toSchemaRDD) 51 | 52 | implicit def FileDatasetToSchemaRdd(fileDS: FileDataset): SchemaRDD = fileDS.toSchemaRDD 53 | 54 | def apply(uc: SparkNotebookContext, file: String, separator: String = ",") = { 55 | new FileDataset(uc, file, separator) 56 | } 57 | } 58 | 59 | class Dataset private[data](schemaRdd: SchemaRDD, originalDataset: Option[Dataset] = None, defaultSummarizedColumns: Option[RDD[(Int, (Int, (Any) => Int, (Any) => Double))]] = None) extends Serializable { 60 | originalDataset.map(f => nameSchemaRDD(f.toSchemaRDD)).getOrElse(nameSchemaRDD(schemaRdd)).map(schemaRdd.registerTempTable(_)) 61 | 62 | private def nameSchemaRDD(schemaRDD: SchemaRDD) = { 63 | schemaRDD.name match { 64 | case null => None 65 | case _ => Some(schemaRdd.name) 66 | } 67 | } 68 | 69 | import org.apache.spark.SparkContext._ 70 | 71 | lazy val columnsSize = summarizedColumns.map(_._2._1).sum().toInt 72 | 73 | lazy val summarizedColumns = defaultSummarizedColumns.getOrElse(summarizeColumns.setName("summarizedColumns").cache()) 74 | 75 | lazy val columnIndexOf = this.schema.fieldNames.zipWithIndex.toSet.toMap 76 | 77 | private def summarizeColumns = { 78 | val fieldsTuple = schemaRdd.schema.fields.zipWithIndex.partition(f => f._1.dataType == StringType) 79 | val (stringFields, nonStringFields) = (fieldsTuple._1.map(_._2), fieldsTuple._2.map(_._2)) 80 | val valuex = schemaRdd.flatMap { 81 | row => 82 | stringFields.map { 83 | sf => 84 | (sf, TreeSet(row.getString(sf))) 85 | } 86 | }.reduceByKey(_ ++ _) 87 | val stringFieldsRdd: RDD[(Int, (Int, (Any => Int), (Any => Double)))] = valuex.map { 88 | case (index, values) => (index ->(values.size, values.zipWithIndex.map(f => (f._1, f._2)).toMap, ((_: Any) => 1.0))) 89 | } 90 | val nonStringMap: Seq[(Int, (Int, (Any => Int), (Any => Double)))] = nonStringFields.map { f => (f, (1, ((_: Any) => 0), ((DataTransformer.toDouble _))))} 91 | stringFieldsRdd.union(stringFieldsRdd.context.parallelize(nonStringMap)) 92 | } 93 | 94 | lazy val summarizedColumnsIndex = summarizeColumnsIndex 95 | 96 | private def summarizeColumnsIndex = { 97 | val summarized = 
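// Maps every expanded column position to its (column name, value) pair: string
// columns contribute one entry per distinct value, non-string columns a single
// entry with an empty value; the fold below re-bases the indices so they are
// global across all columns.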
summarizedColumns.sortBy(_._1).map { 98 | f => 99 | f._2._2 match { 100 | case m: Map[Any, Int] => (f._1, m.map(value => value._2 ->(this.schema.fieldNames(f._1), value._1.toString))) 101 | case _: (Any => Int) => (f._1, Map(0 ->(this.schema.fieldNames(f._1), ""))) 102 | } 103 | }.collect 104 | summarized.foldLeft(Map.empty[Int, (String, String)])((b, a) => 105 | (b ++ a._2.map(f => (f._1 + b.size -> f._2)))) 106 | } 107 | 108 | var labels = Seq(0) 109 | 110 | def applyColumnNames(columnNames: Seq[String]) = { 111 | val structFields = schemaRdd.schema.fields.zip(columnNames).map { 112 | case (structField, columnName) => 113 | new StructField(columnName, structField.dataType, structField.nullable) 114 | } 115 | val newSchemaStruct = StructType(structFields) 116 | val newSchemaRDD = schemaRdd.sqlContext.applySchema(schemaRdd, newSchemaStruct) 117 | newSchemaRDD.name = this.name 118 | new Dataset(newSchemaRDD, Some(this)) 119 | } 120 | 121 | def applyColumnTypes(columnTypes: Seq[DataType]) = { 122 | val structFields = schemaRdd.schema.fields.zip(columnTypes).map { 123 | case (structField, dataType) => 124 | new StructField(structField.name, dataType, structField.nullable) 125 | } 126 | val newSchema = StructType(structFields) 127 | val newRowRDD = convert(schemaRdd, newSchema) 128 | 129 | val newSchemaRDD = schemaRdd.sqlContext.applySchema(newRowRDD, newSchema) 130 | newSchemaRDD.name = this.name 131 | new Dataset(newSchemaRDD, Some(this)) 132 | } 133 | 134 | def columnTypes(): Seq[DataType] = { 135 | schemaRdd.schema.fields.map(_.dataType) 136 | } 137 | 138 | def columnNames(): Seq[String] = { 139 | schemaRdd.schema.fields.map(_.name) 140 | } 141 | 142 | private def convert(rowRdd: RDD[Row], newSchema: StructType): RDD[Row] = { 143 | rowRdd.map { row => 144 | val values = row.zip(newSchema.fields).map { 145 | case (null, _) => null 146 | case (value: Double, StructField(_, DoubleType, _, _)) => value 147 | case (value: BigDecimal, StructField(_, DecimalType(), _, _)) => value 148 | case (value: Timestamp, StructField(_, TimestampType, _, _)) => value 149 | case (value: Long, StructField(_, LongType, _, _)) => value 150 | case (value: Int, StructField(_, IntegerType, _, _)) => value 151 | case (value: Short, StructField(_, ShortType, _, _)) => value 152 | case (value: Boolean, StructField(_, BooleanType, _, _)) => value 153 | case (value, StructField(_, DecimalType(), _, _)) => BigDecimal(value.toString) 154 | case (value, StructField(_, DoubleType, _, _)) => value.toString.toDouble 155 | case (value, StructField(_, LongType, _, _)) => value.toString.toLong 156 | case (value, StructField(_, IntegerType, _, _)) => value.toString.toInt 157 | case (value, StructField(_, ShortType, _, _)) => value.toString.toShort 158 | //converter de double 159 | case (value, StructField(_, BooleanType, _, _)) => value.toString match { 160 | case "1" | "t" | "true" => true 161 | case "0" | "f" | "false" => false 162 | case a => throw new InvalidDataException(s"$a is an invalid Boolean value") 163 | } 164 | case (value, StructField(_, TimestampType, _, _)) => new Timestamp(DateTimeParser.parse(value.toString).map(_.toDate.getTime).getOrElse(throw new InvalidDataException("Unsupported data format Exception, please specify the date format"))) 165 | case (value, StructField(_, StringType, _, _)) => value.toString 166 | } 167 | Row(values: _*) 168 | } 169 | } 170 | 171 | def sliceByName(includes: Seq[String] = (schemaRdd.schema.fields.map(_.name)), excludes: Seq[String] = Seq[String]()): Dataset = { 172 | val 
includesIndices = schemaRdd.schema.fields.zipWithIndex.collect { 173 | case (structField, index) if (includes.contains(structField.name) && !excludes.contains(structField.name)) => index 174 | } 175 | slice(includesIndices, Seq[Int]()) 176 | } 177 | 178 | def slice(includes: Seq[Int] = (0 to schemaRdd.schema.fields.size), excludes: Seq[Int] = Seq.empty[Int]): Dataset = { 179 | 180 | val fields = schemaRdd.schema.fields.zipWithIndex.collect { 181 | case (structField, index) if (includes.contains(index) && !excludes.contains(index)) => structField.name; 182 | } 183 | import schemaRdd.sqlContext.symbolToUnresolvedAttribute 184 | val filtered = fields.map(x => symbolToUnresolvedAttribute(Symbol(x))) 185 | val newSchemaRdd = schemaRdd.select(filtered: _*) 186 | new Dataset(newSchemaRdd, None) 187 | } 188 | 189 | def toSchemaRDD(): SchemaRDD = schemaRdd 190 | 191 | def toLabeledPoint = { 192 | DataTransformer.createLabeledPointFromRDD(schemaRdd, labels, summarizedColumns, DataSetType.Test, columnsSize - 1).values 193 | } 194 | 195 | def formatDateValues(index: Int, dateSplitter: Long): SchemaRDD = { 196 | val rdd = schemaRdd.map { f => 197 | val (before, after) = f.toSeq.splitAt(index) 198 | val formattedDate = splitDateValues(f, index, dateSplitter) 199 | Row(before ++ formattedDate ++ after.headOption.map(_ => after.tail).getOrElse(Seq.empty): _*) 200 | } 201 | val (beforeFields, afterFields) = schemaRdd.schema.fields.splitAt(index) 202 | val dateFields = (1 to determineSizeOfSplitter(dateSplitter)).map(index => new StructField(afterFields.head.name + index, IntegerType, false)) 203 | val fields = beforeFields ++ dateFields ++ afterFields.headOption.map(_ => afterFields.tail).getOrElse(Seq.empty) 204 | val newSchema = StructType(fields) 205 | val newRowRDD = convert(rdd, newSchema) 206 | 207 | val newSchemaRDD = schemaRdd.sqlContext.applySchema(newRowRDD, newSchema) 208 | newSchemaRDD.name = this.name 209 | new Dataset(newSchemaRDD, Some(this)) 210 | } 211 | 212 | type DateSplitterColumnSize = (Long, Long, Int) => Int 213 | type NoSplitterColumnSize = (Long, Int) => Int 214 | 215 | private def splitVerifier: DateSplitterColumnSize = (dateSplitter: Long, verifier: Long, value: Int) => 216 | if (contains(dateSplitter, verifier)) { 217 | value + 1 218 | } else value 219 | 220 | private def noSplit: NoSplitterColumnSize = (dateSplitter: Long, value: Int) => 221 | if (contains(dateSplitter, NoSplit)) { 222 | 0 223 | } else value 224 | 225 | private def determineSizeOfSplitter(dateSplitter: Long) = 226 | splitVerifier(dateSplitter, Period, 227 | splitVerifier(dateSplitter, DayOfAWeek, 228 | noSplit(dateSplitter, 0) 229 | ) 230 | ) 231 | 232 | 233 | val dayZero = new DateTime(1970, 1, 1, 0, 0, 0) 234 | 235 | type DateTimeToInt = DateTime => Int 236 | 237 | type RowDateSplitter = (Long, DateTimeToInt, Seq[Int]) => Seq[Int] 238 | 239 | val daysBetween: DateTimeToInt = { 240 | case d: DateTime => Days.daysBetween(dayZero, d).getDays 241 | } 242 | val getDayOfAWeek: DateTimeToInt = { 243 | case d: DateTime => d.getDayOfWeek 244 | } 245 | val period: DateTimeToInt = { 246 | case d: DateTime => DateTimeParser.period(d).id 247 | } 248 | 249 | protected def splitDateValues(line: Row, index: Int, dateSplitter: Long) = { 250 | def splitDateValues: RowDateSplitter = { 251 | (verifier: Long, datetimefunc: DateTimeToInt, seq: Seq[Int]) => 252 | if (contains(dateSplitter, verifier)) { 253 | val dateValue = if (line.isNullAt(index)) dayZero else new DateTime(line(index).asInstanceOf[Timestamp].getTime, 
DateTimeZone.UTC) 254 | seq ++ Seq(datetimefunc(dateValue)) 255 | } else seq 256 | } 257 | splitDateValues(Period, period, splitDateValues(DayOfAWeek, getDayOfAWeek, splitDateValues(NoSplit, daysBetween, Seq.empty[Int]))) 258 | } 259 | 260 | 261 | 262 | 263 | def translateCorrelation(array: Array[(Double, Int)]) = { 264 | array.map { 265 | f => summarizedColumns.map { 266 | g => g 267 | } 268 | } 269 | } 270 | } 271 | 272 | class FileDataset protected[data](@transient uc: SparkNotebookContext, file: String, separator: String = ",", header: Option[String] = None) extends Serializable { 273 | 274 | lazy val numberOfPartitions = 4 * (ClusterSettings.getNumberOfCores) 275 | 276 | lazy val columnTypes: Array[DataType] = typeLine.map(dataType) 277 | 278 | lazy val typeLine: Array[String] = extractFirstCompleteLine(originalRdd) 279 | 280 | lazy val columnNames: Array[String] = headerOrFirstLine().split(separator, -1) 281 | 282 | lazy val firstLine: String = loadedRDD.first 283 | 284 | lazy val loadedRDD = { 285 | println(s"localFileName:$localFileName") 286 | val file = uc.sparkContext.textFile(localFileName) 287 | println("file") 288 | file 289 | } 290 | 291 | lazy val localFileName: String = { 292 | uc.sparkContext() // make sure that the cluster is up 293 | val uri = Some(new URI(file)) 294 | val destURI = uri.filter { f => f.getScheme() != null && f.getScheme().startsWith("s3")}.map { vl => 295 | val destURI = s"hdfs:///tmp${vl.getPath()}" 296 | uc.copy(file, destURI) 297 | destURI 298 | }.getOrElse(file) 299 | destURI 300 | } 301 | 302 | lazy val originalRdd: RDD[Array[String]] = initOriginalRdd(headerOrFirstLine(), localFileName) 303 | 304 | def headerOrFirstLine(): String = { 305 | header.getOrElse(firstLine) 306 | } 307 | 308 | def initOriginalRdd(header: String, rdd: RDD[String]): RDD[Array[String]] = { 309 | val localHeader = header 310 | val oRdd = rdd.filter(line => line != localHeader).map(_.split(separator, -1)) 311 | oRdd.setName(localFileName) 312 | oRdd.cache 313 | 314 | } 315 | 316 | def initOriginalRdd(header: String, localFileName: String): RDD[Array[String]] = { 317 | initOriginalRdd(header, loadedRDD) 318 | } 319 | 320 | private def dataType(data: String): DataType = { 321 | val double = """[+-]?\d*\.?\d*E?\d{1,4}""" 322 | val intNumber = "-?\\d{1,9}" // more then 9 it cannot be int 323 | val longNumber = "-?\\d{10,18}" // more then 19 it cannot be long 324 | if (data.matches(intNumber)) 325 | LongType // TODO: To return IntType the whole data set (or sample) needs to be analyzed. 326 | else if (data.matches(longNumber)) 327 | LongType 328 | else if (data.matches(double)) 329 | DecimalType() 330 | else 331 | parse(data).getOrElse(StringType) 332 | } 333 | 334 | protected def parse(data: String): Option[DataType] = DateTimeParser.isValidDate(data) match { 335 | case true => Some(TimestampType) 336 | case false => None 337 | } 338 | 339 | lazy val schemaRDD: SchemaRDD = initSchemaRDD(columnNames, originalRdd, structType) 340 | 341 | 342 | protected def initSchemaRDD(columnNames: Array[String], originalRdd: RDD[Array[String]], structType: StructType): SchemaRDD = { 343 | 344 | val sqlContext = uc.sqlContext 345 | val colNames = columnNames 346 | val rowRdd = originalRdd.map { colValues => 347 | if (colValues.size != colNames.size) throw new UnexpectedFileFormatException(s"Files should have the same number of columns. 
Line ${colValues.mkString(",")} \n has #${colValues.size} and the header has #${colNames.size}") 348 | val columns = colValues.zip(structType.fields).zipWithIndex.map { case ((value, tp), index) => 349 | //TODO do not convert the date here 350 | tp.dataType match { 351 | case DecimalType() | DoubleType => value.headOption.map(f => BigDecimal(value.trim)).getOrElse(throw new UnexpectedFileFormatException(s"Numeric columns can't be empty.\nIndex $index is empty at: ${colValues.mkString(",")}")) 352 | case LongType => value.headOption.map(f => value.trim.toLong).getOrElse(throw new UnexpectedFileFormatException(s"Long Numeric columns can't be empty.\nIndex $index is empty at: ${colValues.mkString(",")}")) 353 | case IntegerType => value.headOption.map(f => value.trim.toInt).getOrElse(throw new UnexpectedFileFormatException(s"Int Numeric columns can't be empty.\nIndex $index is empty at: ${colValues.mkString(",")}")) 354 | case TimestampType => new Timestamp(DateTimeParser.parse(value).map(_.toDate.getTime).getOrElse(0)) 355 | case _ => if (value.trim.isEmpty) "0" else value 356 | } 357 | } 358 | Row(columns: _*) 359 | } 360 | val schema = sqlContext.applySchema(rowRdd, structType) 361 | val tableName = extractTableName(file) 362 | schema.name = tableName 363 | schema.registerTempTable(tableName) 364 | schema.repartition(numberOfPartitions) 365 | schema 366 | } 367 | 368 | protected def structType(): StructType = { 369 | if (columnNames.size != typeLine.size || columnNames.size == 0) StructType(List.empty[StructField]) 370 | else { 371 | val fields = columnNames.zip(columnTypes).map { case (columnName, columnType) => new StructField(columnName, columnType, true)} 372 | StructType(fields) 373 | } 374 | } 375 | 376 | protected def extractFirstCompleteLine(dataRdd: RDD[Array[String]]): Array[String] = { 377 | val x = dataRdd.filter { f => 378 | !f.isEmpty && 379 | f.forall(!_.isEmpty) 380 | }.first 381 | x 382 | } 383 | 384 | protected def extractTableName(file: String): String = { 385 | val name = file.split("/").last 386 | val index = name.indexOf(".csv") + name.indexOf(".txt") 387 | name.splitAt(index + 1).productIterator.toList.filter(!_.toString.isEmpty).head.toString 388 | } 389 | 390 | def header(newHeader: String) = { 391 | new FileDataset(uc, file, separator, Some(newHeader)) 392 | } 393 | 394 | def toSchemaRDD = schemaRDD 395 | 396 | def toDataset(): Dataset = { 397 | new Dataset(schemaRDD) 398 | } 399 | } 400 | -------------------------------------------------------------------------------- /ansible/inventory/hosts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | EC2 external inventory script 5 | ================================= 6 | 7 | Generates inventory that Ansible can understand by making API requests to 8 | AWS EC2 using the Boto library. 9 | 10 | NOTE: This script assumes Ansible is being executed where the environment 11 | variables needed for Boto have already been set: 12 | export AWS_ACCESS_KEY_ID='AK123' 13 | export AWS_SECRET_ACCESS_KEY='abc123' 14 | 15 | This script also assumes there is an ec2.ini file alongside it. 
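A typical invocation, assuming this file has been made executable and the
Boto credentials above are exported (the address below is purely
illustrative), looks like:

    ./hosts --list
    ./hosts --host 203.0.113.10
    ./hosts --refresh-cache --list
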
To specify a 16 | different path to ec2.ini, define the EC2_INI_PATH environment variable: 17 | 18 | export EC2_INI_PATH=/path/to/my_ec2.ini 19 | 20 | If you're using eucalyptus you need to set the above variables and 21 | you need to define: 22 | 23 | export EC2_URL=http://hostname_of_your_cc:port/services/Eucalyptus 24 | 25 | For more details, see: http://docs.pythonboto.org/en/latest/boto_config_tut.html 26 | 27 | When run against a specific host, this script returns the following variables: 28 | - ec2_ami_launch_index 29 | - ec2_architecture 30 | - ec2_association 31 | - ec2_attachTime 32 | - ec2_attachment 33 | - ec2_attachmentId 34 | - ec2_client_token 35 | - ec2_deleteOnTermination 36 | - ec2_description 37 | - ec2_deviceIndex 38 | - ec2_dns_name 39 | - ec2_eventsSet 40 | - ec2_group_name 41 | - ec2_hypervisor 42 | - ec2_id 43 | - ec2_image_id 44 | - ec2_instanceState 45 | - ec2_instance_type 46 | - ec2_ipOwnerId 47 | - ec2_ip_address 48 | - ec2_item 49 | - ec2_kernel 50 | - ec2_key_name 51 | - ec2_launch_time 52 | - ec2_monitored 53 | - ec2_monitoring 54 | - ec2_networkInterfaceId 55 | - ec2_ownerId 56 | - ec2_persistent 57 | - ec2_placement 58 | - ec2_platform 59 | - ec2_previous_state 60 | - ec2_private_dns_name 61 | - ec2_private_ip_address 62 | - ec2_publicIp 63 | - ec2_public_dns_name 64 | - ec2_ramdisk 65 | - ec2_reason 66 | - ec2_region 67 | - ec2_requester_id 68 | - ec2_root_device_name 69 | - ec2_root_device_type 70 | - ec2_security_group_ids 71 | - ec2_security_group_names 72 | - ec2_shutdown_state 73 | - ec2_sourceDestCheck 74 | - ec2_spot_instance_request_id 75 | - ec2_state 76 | - ec2_state_code 77 | - ec2_state_reason 78 | - ec2_status 79 | - ec2_subnet_id 80 | - ec2_tenancy 81 | - ec2_virtualization_type 82 | - ec2_vpc_id 83 | 84 | These variables are pulled out of a boto.ec2.instance object. There is a lack of 85 | consistency with variable spellings (camelCase and underscores) since this 86 | just loops through all variables the object exposes. It is preferred to use the 87 | ones with underscores when multiple exist. 88 | 89 | In addition, if an instance has AWS Tags associated with it, each tag is a new 90 | variable named: 91 | - ec2_tag_[Key] = [Value] 92 | 93 | Security groups are comma-separated in 'ec2_security_group_ids' and 94 | 'ec2_security_group_names'. 95 | ''' 96 | 97 | # (c) 2012, Peter Sankauskas 98 | # 99 | # This file is part of Ansible, 100 | # 101 | # Ansible is free software: you can redistribute it and/or modify 102 | # it under the terms of the GNU General Public License as published by 103 | # the Free Software Foundation, either version 3 of the License, or 104 | # (at your option) any later version. 105 | # 106 | # Ansible is distributed in the hope that it will be useful, 107 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 108 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 109 | # GNU General Public License for more details. 110 | # 111 | # You should have received a copy of the GNU General Public License 112 | # along with Ansible. If not, see . 
113 | 114 | ###################################################################### 115 | 116 | import sys 117 | import os 118 | import argparse 119 | import re 120 | from time import time 121 | import boto 122 | from boto import ec2 123 | from boto import rds 124 | from boto import route53 125 | import ConfigParser 126 | 127 | try: 128 | import json 129 | except ImportError: 130 | import simplejson as json 131 | 132 | 133 | class Ec2Inventory(object): 134 | def _empty_inventory(self): 135 | return {"_meta" : {"hostvars" : {}}} 136 | 137 | def __init__(self): 138 | ''' Main execution path ''' 139 | 140 | # Inventory grouped by instance IDs, tags, security groups, regions, 141 | # and availability zones 142 | self.inventory = self._empty_inventory() 143 | 144 | # Index of hostname (address) to instance ID 145 | self.index = {} 146 | 147 | # Read settings and parse CLI arguments 148 | self.read_settings() 149 | self.parse_cli_args() 150 | 151 | # Cache 152 | if self.args.refresh_cache: 153 | self.do_api_calls_update_cache() 154 | elif not self.is_cache_valid(): 155 | self.do_api_calls_update_cache() 156 | 157 | # Data to print 158 | if self.args.host: 159 | data_to_print = self.get_host_info() 160 | 161 | elif self.args.list: 162 | # Display list of instances for inventory 163 | if self.inventory == self._empty_inventory(): 164 | data_to_print = self.get_inventory_from_cache() 165 | else: 166 | data_to_print = self.json_format_dict(self.inventory, True) 167 | 168 | print data_to_print 169 | 170 | 171 | def is_cache_valid(self): 172 | ''' Determines if the cache files have expired, or if it is still valid ''' 173 | 174 | if os.path.isfile(self.cache_path_cache): 175 | mod_time = os.path.getmtime(self.cache_path_cache) 176 | current_time = time() 177 | if (mod_time + self.cache_max_age) > current_time: 178 | if os.path.isfile(self.cache_path_index): 179 | return True 180 | 181 | return False 182 | 183 | 184 | def read_settings(self): 185 | ''' Reads the settings from the ec2.ini file ''' 186 | 187 | config = ConfigParser.SafeConfigParser() 188 | ec2_default_ini_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'ec2.ini') 189 | ec2_ini_path = os.environ.get('EC2_INI_PATH', ec2_default_ini_path) 190 | config.read(ec2_ini_path) 191 | 192 | # is eucalyptus? 
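# For reference, an illustrative ec2.ini for this method (example values only;
# regions, regions_exclude, destination_variable, vpc_destination_variable,
# route53, cache_path and cache_max_age are read unconditionally below, the
# remaining keys are optional):
#
#   [ec2]
#   regions = all
#   regions_exclude = us-gov-west-1
#   destination_variable = public_dns_name
#   vpc_destination_variable = ip_address
#   route53 = False
#   all_instances = False
#   cache_path = ~/.ansible/tmp
#   cache_max_age = 300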
193 | self.eucalyptus_host = None 194 | self.eucalyptus = False 195 | if config.has_option('ec2', 'eucalyptus'): 196 | self.eucalyptus = config.getboolean('ec2', 'eucalyptus') 197 | if self.eucalyptus and config.has_option('ec2', 'eucalyptus_host'): 198 | self.eucalyptus_host = config.get('ec2', 'eucalyptus_host') 199 | 200 | # Regions 201 | self.regions = [] 202 | configRegions = config.get('ec2', 'regions') 203 | configRegions_exclude = config.get('ec2', 'regions_exclude') 204 | if (configRegions == 'all'): 205 | if self.eucalyptus_host: 206 | self.regions.append(boto.connect_euca(host=self.eucalyptus_host).region.name) 207 | else: 208 | for regionInfo in ec2.regions(): 209 | if regionInfo.name not in configRegions_exclude: 210 | self.regions.append(regionInfo.name) 211 | else: 212 | self.regions = configRegions.split(",") 213 | 214 | # Destination addresses 215 | self.destination_variable = config.get('ec2', 'destination_variable') 216 | self.vpc_destination_variable = config.get('ec2', 'vpc_destination_variable') 217 | 218 | # Route53 219 | self.route53_enabled = config.getboolean('ec2', 'route53') 220 | self.route53_excluded_zones = [] 221 | if config.has_option('ec2', 'route53_excluded_zones'): 222 | self.route53_excluded_zones.extend( 223 | config.get('ec2', 'route53_excluded_zones', '').split(',')) 224 | 225 | # Return all EC2/RDS instances 226 | if config.has_option('ec2', 'all_instances'): 227 | self.all_instances = config.getboolean('ec2', 'all_instances') 228 | else: 229 | self.all_instances = False 230 | if config.has_option('ec2', 'all_rds_instances'): 231 | self.all_rds_instances = config.getboolean('ec2', 'all_rds_instances') 232 | else: 233 | self.all_rds_instances = False 234 | 235 | # Cache related 236 | cache_dir = os.path.expanduser(config.get('ec2', 'cache_path')) 237 | if not os.path.exists(cache_dir): 238 | os.makedirs(cache_dir) 239 | 240 | self.cache_path_cache = cache_dir + "/ansible-ec2.cache" 241 | self.cache_path_index = cache_dir + "/ansible-ec2.index" 242 | self.cache_max_age = config.getint('ec2', 'cache_max_age') 243 | 244 | 245 | 246 | def parse_cli_args(self): 247 | ''' Command line argument processing ''' 248 | 249 | parser = argparse.ArgumentParser(description='Produce an Ansible Inventory file based on EC2') 250 | parser.add_argument('--list', action='store_true', default=True, 251 | help='List instances (default: True)') 252 | parser.add_argument('--host', action='store', 253 | help='Get all the variables about a specific instance') 254 | parser.add_argument('--refresh-cache', action='store_true', default=False, 255 | help='Force refresh of cache by making API requests to EC2 (default: False - use cache files)') 256 | self.args = parser.parse_args() 257 | 258 | 259 | def do_api_calls_update_cache(self): 260 | ''' Do API calls to each region, and save data in cache files ''' 261 | 262 | if self.route53_enabled: 263 | self.get_route53_records() 264 | 265 | for region in self.regions: 266 | self.get_instances_by_region(region) 267 | self.get_rds_instances_by_region(region) 268 | 269 | self.write_to_cache(self.inventory, self.cache_path_cache) 270 | self.write_to_cache(self.index, self.cache_path_index) 271 | 272 | 273 | def get_instances_by_region(self, region): 274 | ''' Makes an AWS EC2 API call to the list of instances in a particular 275 | region ''' 276 | 277 | try: 278 | if self.eucalyptus: 279 | conn = boto.connect_euca(host=self.eucalyptus_host) 280 | conn.APIVersion = '2010-08-31' 281 | else: 282 | conn = ec2.connect_to_region(region) 283 | 284 
| # connect_to_region will fail "silently" by returning None if the region name is wrong or not supported 285 | if conn is None: 286 | print("region name: %s likely not supported, or AWS is down. connection to region failed." % region) 287 | sys.exit(1) 288 | 289 | reservations = conn.get_all_instances() 290 | for reservation in reservations: 291 | for instance in reservation.instances: 292 | self.add_instance(instance, region) 293 | 294 | except boto.exception.BotoServerError, e: 295 | if not self.eucalyptus: 296 | print "Looks like AWS is down again:" 297 | print e 298 | sys.exit(1) 299 | 300 | def get_rds_instances_by_region(self, region): 301 | ''' Makes an AWS API call to the list of RDS instances in a particular 302 | region ''' 303 | 304 | try: 305 | conn = rds.connect_to_region(region) 306 | if conn: 307 | instances = conn.get_all_dbinstances() 308 | for instance in instances: 309 | self.add_rds_instance(instance, region) 310 | except boto.exception.BotoServerError, e: 311 | if not e.reason == "Forbidden": 312 | print "Looks like AWS RDS is down: " 313 | print e 314 | sys.exit(1) 315 | 316 | def get_instance(self, region, instance_id): 317 | ''' Gets details about a specific instance ''' 318 | if self.eucalyptus: 319 | conn = boto.connect_euca(self.eucalyptus_host) 320 | conn.APIVersion = '2010-08-31' 321 | else: 322 | conn = ec2.connect_to_region(region) 323 | 324 | # connect_to_region will fail "silently" by returning None if the region name is wrong or not supported 325 | if conn is None: 326 | print("region name: %s likely not supported, or AWS is down. connection to region failed." % region) 327 | sys.exit(1) 328 | 329 | reservations = conn.get_all_instances([instance_id]) 330 | for reservation in reservations: 331 | for instance in reservation.instances: 332 | return instance 333 | 334 | 335 | def add_instance(self, instance, region): 336 | ''' Adds an instance to the inventory and index, as long as it is 337 | addressable ''' 338 | 339 | # Only want running instances unless all_instances is True 340 | if not self.all_instances and instance.state != 'running': 341 | return 342 | 343 | # Select the best destination address 344 | if instance.subnet_id: 345 | dest = getattr(instance, self.vpc_destination_variable) 346 | else: 347 | dest = getattr(instance, self.destination_variable) 348 | 349 | if not dest: 350 | # Skip instances we cannot address (e.g. private VPC subnet) 351 | return 352 | 353 | # Add to index 354 | self.index[dest] = [region, instance.id] 355 | 356 | # Inventory: Group by instance ID (always a group of 1) 357 | self.inventory[instance.id] = [dest] 358 | 359 | # Inventory: Group by region 360 | self.push(self.inventory, region, dest) 361 | 362 | # Inventory: Group by availability zone 363 | self.push(self.inventory, instance.placement, dest) 364 | 365 | # Inventory: Group by instance type 366 | self.push(self.inventory, self.to_safe('type_' + instance.instance_type), dest) 367 | 368 | # Inventory: Group by key pair 369 | if instance.key_name: 370 | self.push(self.inventory, self.to_safe('key_' + instance.key_name), dest) 371 | 372 | # Inventory: Group by security group 373 | try: 374 | for group in instance.groups: 375 | key = self.to_safe("security_group_" + group.name) 376 | self.push(self.inventory, key, dest) 377 | except AttributeError: 378 | print 'Package boto seems a bit older.' 379 | print 'Please upgrade boto >= 2.3.0.' 
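# instance.groups, iterated above, is only exposed by boto 2.3.0 and later,
# hence the upgrade hint followed by a hard exit.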
380 | sys.exit(1) 381 | 382 | # Inventory: Group by tag keys 383 | for k, v in instance.tags.iteritems(): 384 | key = self.to_safe("tag_" + k + "=" + v) 385 | self.push(self.inventory, key, dest) 386 | 387 | # Inventory: Group by Route53 domain names if enabled 388 | if self.route53_enabled: 389 | route53_names = self.get_instance_route53_names(instance) 390 | for name in route53_names: 391 | self.push(self.inventory, name, dest) 392 | 393 | # Global Tag: tag all EC2 instances 394 | self.push(self.inventory, 'ec2', dest) 395 | 396 | self.inventory["_meta"]["hostvars"][dest] = self.get_host_info_dict_from_instance(instance) 397 | 398 | 399 | def add_rds_instance(self, instance, region): 400 | ''' Adds an RDS instance to the inventory and index, as long as it is 401 | addressable ''' 402 | 403 | # Only want available instances unless all_rds_instances is True 404 | if not self.all_rds_instances and instance.status != 'available': 405 | return 406 | 407 | # Select the best destination address 408 | #if instance.subnet_id: 409 | #dest = getattr(instance, self.vpc_destination_variable) 410 | #else: 411 | #dest = getattr(instance, self.destination_variable) 412 | dest = instance.endpoint[0] 413 | 414 | if not dest: 415 | # Skip instances we cannot address (e.g. private VPC subnet) 416 | return 417 | 418 | # Add to index 419 | self.index[dest] = [region, instance.id] 420 | 421 | # Inventory: Group by instance ID (always a group of 1) 422 | self.inventory[instance.id] = [dest] 423 | 424 | # Inventory: Group by region 425 | self.push(self.inventory, region, dest) 426 | 427 | # Inventory: Group by availability zone 428 | self.push(self.inventory, instance.availability_zone, dest) 429 | 430 | # Inventory: Group by instance type 431 | self.push(self.inventory, self.to_safe('type_' + instance.instance_class), dest) 432 | 433 | # Inventory: Group by security group 434 | try: 435 | if instance.security_group: 436 | key = self.to_safe("security_group_" + instance.security_group.name) 437 | self.push(self.inventory, key, dest) 438 | except AttributeError: 439 | print 'Package boto seems a bit older.' 440 | print 'Please upgrade boto >= 2.3.0.' 441 | sys.exit(1) 442 | 443 | # Inventory: Group by engine 444 | self.push(self.inventory, self.to_safe("rds_" + instance.engine), dest) 445 | 446 | # Inventory: Group by parameter group 447 | self.push(self.inventory, self.to_safe("rds_parameter_group_" + instance.parameter_group.name), dest) 448 | 449 | # Global Tag: all RDS instances 450 | self.push(self.inventory, 'rds', dest) 451 | 452 | 453 | def get_route53_records(self): 454 | ''' Get and store the map of resource records to domain names that 455 | point to them. ''' 456 | 457 | r53_conn = route53.Route53Connection() 458 | all_zones = r53_conn.get_zones() 459 | 460 | route53_zones = [ zone for zone in all_zones if zone.name[:-1] 461 | not in self.route53_excluded_zones ] 462 | 463 | self.route53_records = {} 464 | 465 | for zone in route53_zones: 466 | rrsets = r53_conn.get_all_rrsets(zone.id) 467 | 468 | for record_set in rrsets: 469 | record_name = record_set.name 470 | 471 | if record_name.endswith('.'): 472 | record_name = record_name[:-1] 473 | 474 | for resource in record_set.resource_records: 475 | self.route53_records.setdefault(resource, set()) 476 | self.route53_records[resource].add(record_name) 477 | 478 | 479 | def get_instance_route53_names(self, instance): 480 | ''' Check if an instance is referenced in the records we have from 481 | Route53. 
If it is, return the list of domain names pointing to said 482 | instance. If nothing points to it, return an empty list. ''' 483 | 484 | instance_attributes = [ 'public_dns_name', 'private_dns_name', 485 | 'ip_address', 'private_ip_address' ] 486 | 487 | name_list = set() 488 | 489 | for attrib in instance_attributes: 490 | try: 491 | value = getattr(instance, attrib) 492 | except AttributeError: 493 | continue 494 | 495 | if value in self.route53_records: 496 | name_list.update(self.route53_records[value]) 497 | 498 | return list(name_list) 499 | 500 | 501 | def get_host_info_dict_from_instance(self, instance): 502 | instance_vars = {} 503 | for key in vars(instance): 504 | value = getattr(instance, key) 505 | key = self.to_safe('ec2_' + key) 506 | 507 | # Handle complex types 508 | # state/previous_state changed to properties in boto in https://github.com/boto/boto/commit/a23c379837f698212252720d2af8dec0325c9518 509 | if key == 'ec2__state': 510 | instance_vars['ec2_state'] = instance.state or '' 511 | instance_vars['ec2_state_code'] = instance.state_code 512 | elif key == 'ec2__previous_state': 513 | instance_vars['ec2_previous_state'] = instance.previous_state or '' 514 | instance_vars['ec2_previous_state_code'] = instance.previous_state_code 515 | elif type(value) in [int, bool]: 516 | instance_vars[key] = value 517 | elif type(value) in [str, unicode]: 518 | instance_vars[key] = value.strip() 519 | elif type(value) == type(None): 520 | instance_vars[key] = '' 521 | elif key == 'ec2_region': 522 | instance_vars[key] = value.name 523 | elif key == 'ec2__placement': 524 | instance_vars['ec2_placement'] = value.zone 525 | elif key == 'ec2_tags': 526 | for k, v in value.iteritems(): 527 | key = self.to_safe('ec2_tag_' + k) 528 | instance_vars[key] = v 529 | elif key == 'ec2_groups': 530 | group_ids = [] 531 | group_names = [] 532 | for group in value: 533 | group_ids.append(group.id) 534 | group_names.append(group.name) 535 | instance_vars["ec2_security_group_ids"] = ','.join(group_ids) 536 | instance_vars["ec2_security_group_names"] = ','.join(group_names) 537 | else: 538 | pass 539 | # TODO Product codes if someone finds them useful 540 | #print key 541 | #print type(value) 542 | #print value 543 | 544 | return instance_vars 545 | 546 | def get_host_info(self): 547 | ''' Get variables about a specific host ''' 548 | 549 | if len(self.index) == 0: 550 | # Need to load index from cache 551 | self.load_index_from_cache() 552 | 553 | if not self.args.host in self.index: 554 | # try updating the cache 555 | self.do_api_calls_update_cache() 556 | if not self.args.host in self.index: 557 | # host might not exist anymore 558 | return self.json_format_dict({}, True) 559 | 560 | (region, instance_id) = self.index[self.args.host] 561 | 562 | instance = self.get_instance(region, instance_id) 563 | return self.json_format_dict(self.get_host_info_dict_from_instance(instance), True) 564 | 565 | def push(self, my_dict, key, element): 566 | ''' Pushes an element onto an array that may not have been defined in 567 | the dict ''' 568 | 569 | if key in my_dict: 570 | my_dict[key].append(element) 571 | else: 572 | my_dict[key] = [element] 573 | 574 | 575 | def get_inventory_from_cache(self): 576 | ''' Reads the inventory from the cache file and returns it as a JSON 577 | object ''' 578 | 579 | cache = open(self.cache_path_cache, 'r') 580 | json_inventory = cache.read() 581 | return json_inventory 582 | 583 | 584 | def load_index_from_cache(self): 585 | ''' Reads the index from the cache file and sets 
self.index ''' 586 | 587 | cache = open(self.cache_path_index, 'r') 588 | json_index = cache.read() 589 | self.index = json.loads(json_index) 590 | 591 | 592 | def write_to_cache(self, data, filename): 593 | ''' Writes data in JSON format to a file ''' 594 | 595 | json_data = self.json_format_dict(data, True) 596 | cache = open(filename, 'w') 597 | cache.write(json_data) 598 | cache.close() 599 | 600 | 601 | def to_safe(self, word): 602 | ''' Converts 'bad' characters in a string to underscores so they can be 603 | used as Ansible groups ''' 604 | 605 | return re.sub("[^A-Za-z0-9\-]", "_", word) 606 | 607 | 608 | def json_format_dict(self, data, pretty=False): 609 | ''' Converts a dict to a JSON object and dumps it as a formatted 610 | string ''' 611 | 612 | if pretty: 613 | return json.dumps(data, sort_keys=True, indent=2) 614 | else: 615 | return json.dumps(data) 616 | 617 | 618 | # Run the script 619 | Ec2Inventory() 620 | 621 | -------------------------------------------------------------------------------- /src/universal/ec2/spark_ec2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import with_statement 5 | 6 | import logging 7 | import os 8 | import pipes 9 | import random 10 | import shutil 11 | import subprocess 12 | import sys 13 | import tempfile 14 | import time 15 | import urllib2 16 | import string 17 | from optparse import OptionParser 18 | from sys import stderr 19 | from string import Template 20 | import boto 21 | from boto.ec2.blockdevicemapping import BlockDeviceMapping, EBSBlockDeviceType 22 | from boto import ec2 23 | import datetime 24 | from datetime import datetime 25 | from datetime import timedelta 26 | 27 | class UsageError(Exception): 28 | pass 29 | 30 | DEFAULT_SPARK_VERSION = "1.2.0" 31 | SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) 32 | MESOS_SPARK_EC2_BRANCH = "v4" 33 | # A URL prefix from which to fetch AMI information 34 | AMI_PREFIX = "https://raw.github.com/mesos/spark-ec2/{b}/ami-list".format(b=MESOS_SPARK_EC2_BRANCH) 35 | 36 | 37 | # Configure and parse our command-line arguments 38 | def parse_args(): 39 | parser = OptionParser(usage="spark-ec2 [options] " 40 | + "\n\n can be: launch, destroy, login, stop, start, get-master", 41 | add_help_option=False) 42 | parser.add_option("-h", "--help", action="help", 43 | help="Show this help message and exit") 44 | parser.add_option("-s", "--slaves", type="int", default=1, 45 | help="Number of slaves to launch (default: 1)") 46 | parser.add_option("-w", "--wait", type="int", default=120, 47 | help="Seconds to wait for nodes to start (default: 120)") 48 | parser.add_option("-k", "--key-pair", 49 | help="Key pair to use on instances") 50 | parser.add_option("-i", "--identity-file", 51 | help="SSH private key file to use for logging into instances") 52 | parser.add_option("-t", "--instance-type", default="m1.large", 53 | help="Type of instance to launch (default: m1.large). 
" + 54 | "WARNING: must be 64-bit; small instances won't work") 55 | parser.add_option("-m", "--master-instance-type", default="", 56 | help="Master instance type (leave empty for same as instance-type)") 57 | parser.add_option("-r", "--region", help="EC2 region zone to launch instances in") 58 | parser.add_option("-z", "--zone", help="Availability zone to launch instances in, or 'all' to spread " + 59 | "slaves across multiple (an additional $0.01/Gb for bandwidth" + 60 | "between zones applies)") 61 | parser.add_option("-a", "--ami", help="Amazon Machine Image ID to use") 62 | parser.add_option("-p", "--profile", help="AWS profile/role arn to use") 63 | parser.add_option("-v", "--spark-version", default=DEFAULT_SPARK_VERSION, 64 | help="Version of Spark to use: 'X.Y.Z' or a specific git hash") 65 | parser.add_option("--spark-git-repo", 66 | default="https://github.com/apache/spark", 67 | help="Github repo from which to checkout supplied commit hash") 68 | parser.add_option("--hadoop-major-version", default="2", 69 | help="Major version of Hadoop (default: 2)") 70 | parser.add_option("-D", metavar="[ADDRESS:]PORT", dest="proxy_port", 71 | help="Use SSH dynamic port forwarding to create a SOCKS proxy at " + 72 | "the given local address (for use with login)") 73 | parser.add_option("--resume", action="store_true", default=False, 74 | help="Resume installation on a previously launched cluster " + 75 | "(for debugging)") 76 | parser.add_option("--ebs-vol-size", metavar="SIZE", type="int", default=0, 77 | help="Attach a new EBS volume of size SIZE (in GB) to each node as " + 78 | "/vol. The volumes will be deleted when the instances terminate. " + 79 | "Only possible on EBS-backed AMIs.") 80 | parser.add_option("--swap", metavar="SWAP", type="int", default=1024, 81 | help="Swap space to set up per node, in MB (default: 1024)") 82 | parser.add_option("--spot-price", metavar="PRICE", type="float", 83 | help="If specified, launch slaves as spot instances with the given " + 84 | "maximum price (in dollars)") 85 | parser.add_option("--ganglia", action="store_true", default=True, 86 | help="Setup Ganglia monitoring on cluster (default: on). 
NOTE: " + 87 | "the Ganglia page will be publicly accessible") 88 | parser.add_option("--no-ganglia", action="store_false", dest="ganglia", 89 | help="Disable Ganglia monitoring for the cluster") 90 | parser.add_option("-u", "--user", default="root", 91 | help="The SSH user you want to connect as (default: root)") 92 | parser.add_option("--delete-groups", action="store_true", default=False, 93 | help="When destroying a cluster, delete the security groups that were created") 94 | parser.add_option("--use-existing-master", action="store_true", default=False, 95 | help="Launch fresh slaves, but use an existing stopped master if possible") 96 | parser.add_option("--worker-instances", type="int", default=1, 97 | help="Number of instances per worker: variable SPARK_WORKER_INSTANCES (default: 1)") 98 | parser.add_option("--master-opts", type="string", default="", 99 | help="Extra options to give to master through SPARK_MASTER_OPTS variable (e.g -Dspark.worker.timeout=180)") 100 | 101 | (opts, args) = parser.parse_args() 102 | if len(args) != 2: 103 | parser.print_help() 104 | sys.exit(1) 105 | (action, cluster_name) = args 106 | 107 | if opts.region is None: 108 | opts.region = region() 109 | 110 | if opts.zone is None: 111 | opts.zone = zone() 112 | 113 | # Boto config check 114 | # http://boto.cloudhackers.com/en/latest/boto_config_tut.html 115 | # home_dir = os.getenv('HOME') 116 | # if home_dir == None or not os.path.isfile(home_dir + '/.boto'): 117 | # if not os.path.isfile('/etc/boto.cfg'): 118 | # if os.getenv('AWS_ACCESS_KEY_ID') == None: 119 | # print >> stderr, ("ERROR: The environment variable AWS_ACCESS_KEY_ID " + 120 | # "must be set") 121 | # sys.exit(1) 122 | # if os.getenv('AWS_SECRET_ACCESS_KEY') == None: 123 | # print >> stderr, ("ERROR: The environment variable AWS_SECRET_ACCESS_KEY " + 124 | # "must be set") 125 | # sys.exit(1) 126 | return (opts, action, cluster_name) 127 | 128 | 129 | # Get the EC2 security group of the given name, creating it if it doesn't exist 130 | def get_or_make_group(conn, name): 131 | groups = conn.get_all_security_groups() 132 | group = [g for g in groups if g.name == name] 133 | if len(group) > 0: 134 | return group[0] 135 | else: 136 | print "Creating security group " + name 137 | return conn.create_security_group(name, "Spark EC2 group") 138 | 139 | 140 | # Wait for a set of launched instances to exit the "pending" state 141 | # (i.e. either to start running or to fail and be terminated) 142 | def wait_for_instances(conn, instances): 143 | ids = [i.id for i in instances] 144 | while True: 145 | # for i in instances: 146 | # i.update() 147 | # if len([i for i in instances if i.state == 'pending']) > 0: 148 | # 149 | instace_stati = conn.get_all_instance_status(instance_ids=ids) 150 | if len([i for i in instace_stati if i.system_status.details['reachability'] != 'passed' or i.instance_status.details['reachability'] != 'passed']) > 0: 151 | time.sleep(5) 152 | else: 153 | return 154 | 155 | 156 | # Check whether a given EC2 instance object is in a state we consider active, 157 | # i.e. not terminating or terminated. We count both stopping and stopped as 158 | # active since we can restart stopped clusters. 
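# (A stopped master, for instance, can be started again by launch_cluster when
# --use-existing-master is passed, so it must still count as active here.)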
159 | def is_active(instance): 160 | return (instance.state in ['pending', 'running', 'stopping', 'stopped']) 161 | 162 | # Return correct versions of Spark and Shark, given the supplied Spark version 163 | def get_spark_shark_version(opts): 164 | spark_shark_map = { 165 | "0.7.3": "0.7.1", 166 | "0.8.0": "0.8.0", 167 | "0.8.1": "0.8.1", 168 | "0.9.0": "0.9.0", 169 | "0.9.1": "0.9.1", 170 | "1.0.0": "1.0.0", 171 | "1.0.1": "1.0.1", 172 | "1.0.2": "1.0.2", 173 | "1.1.0": "1.1.0", 174 | "1.2.0": "1.2.0" 175 | } 176 | version = opts.spark_version.replace("v", "") 177 | if version not in spark_shark_map: 178 | print >> stderr, "Don't know about Spark version: %s" % version 179 | sys.exit(1) 180 | return (version, spark_shark_map[version]) 181 | 182 | # Attempt to resolve an appropriate AMI given the architecture and 183 | # region of the request. 184 | # Information regarding Amazon Linux AMI instance type was updated on 2014-6-20: 185 | # http://aws.amazon.com/amazon-linux-ami/instance-type-matrix/ 186 | def get_spark_ami(opts): 187 | instance_types = { 188 | "c1.medium": "pvm", 189 | "c1.xlarge": "pvm", 190 | "c3.2xlarge": "pvm", 191 | "c3.4xlarge": "pvm", 192 | "c3.8xlarge": "pvm", 193 | "c3.large": "pvm", 194 | "c3.xlarge": "pvm", 195 | "cc1.4xlarge": "hvm", 196 | "cc2.8xlarge": "hvm", 197 | "cg1.4xlarge": "hvm", 198 | "cr1.8xlarge": "hvm", 199 | "hi1.4xlarge": "pvm", 200 | "hs1.8xlarge": "pvm", 201 | "i2.2xlarge": "hvm", 202 | "i2.4xlarge": "hvm", 203 | "i2.8xlarge": "hvm", 204 | "i2.xlarge": "hvm", 205 | "m1.large": "pvm", 206 | "m1.medium": "pvm", 207 | "m1.small": "pvm", 208 | "m1.xlarge": "pvm", 209 | "m2.2xlarge": "pvm", 210 | "m2.4xlarge": "pvm", 211 | "m2.xlarge": "pvm", 212 | "m3.2xlarge": "hvm", 213 | "m3.large": "hvm", 214 | "m3.medium": "hvm", 215 | "m3.xlarge": "hvm", 216 | "r3.2xlarge": "hvm", 217 | "r3.4xlarge": "hvm", 218 | "r3.8xlarge": "hvm", 219 | "r3.large": "hvm", 220 | "r3.xlarge": "hvm", 221 | "t1.micro": "pvm", 222 | "t2.medium": "hvm", 223 | "t2.micro": "hvm", 224 | "t2.small": "hvm", 225 | } 226 | 227 | if opts.instance_type in instance_types: 228 | instance_type = instance_types[opts.instance_type] 229 | else: 230 | instance_type = "pvm" 231 | print >> stderr,\ 232 | "Don't recognize %s, assuming type is pvm" % opts.instance_type 233 | 234 | ami_path = "%s/%s/%s" % (AMI_PREFIX, opts.region, instance_type) 235 | try: 236 | ami = urllib2.urlopen(ami_path).read().strip() 237 | print "Spark AMI: " + ami 238 | except: 239 | print >> stderr, "Could not resolve AMI at: " + ami_path 240 | sys.exit(1) 241 | 242 | return ami 243 | 244 | # Launch a cluster of the given name, by setting up its security groups, 245 | # and then starting new instances in them. 246 | # Returns a tuple of EC2 reservation objects for the master and slaves 247 | # Fails if there are already instances running in the cluster's groups. 248 | def launch_cluster(conn, opts, cluster_name): 249 | 250 | #Remove known hosts to avoid "Offending key for IP ..." errors. 251 | known_hosts = os.environ['HOME'] + "/.ssh/known_hosts" 252 | if os.path.isfile(known_hosts): 253 | os.remove(known_hosts) 254 | if opts.key_pair is None: 255 | opts.key_pair = keypair() 256 | if opts.key_pair is None: 257 | print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." 258 | sys.exit(1) 259 | 260 | if opts.profile is None: 261 | opts.profile = profile() 262 | if opts.profile is None: 263 | print >> stderr, "ERROR: No profile found on the current host. It must be provided with the -p option." 
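# opts.profile is the IAM instance profile/role ARN taken from -p/--profile;
# it is forwarded as instance_profile_arn to every launch and spot request
# below, so the cluster cannot be brought up without it.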
264 | sys.exit(1) 265 | 266 | public_key = pub_key() 267 | user_data = Template("""#!/bin/bash 268 | set -e -x 269 | echo '$public_key' >> ~root/.ssh/authorized_keys 270 | echo '$public_key' >> ~ec2-user/.ssh/authorized_keys""").substitute(public_key=public_key) 271 | 272 | print "Setting up security groups..." 273 | master_group = get_or_make_group(conn, cluster_name + "-master") 274 | slave_group = get_or_make_group(conn, cluster_name + "-slaves") 275 | sparknotebook_group = get_or_make_group(conn, "SparkNotebookApplication") 276 | if master_group.rules == []: # Group was just now created 277 | master_group.authorize(src_group=master_group) 278 | master_group.authorize(src_group=slave_group) 279 | master_group.authorize(src_group=sparknotebook_group) 280 | master_group.authorize('tcp', 22, 22, '0.0.0.0/0') 281 | master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') 282 | master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0') 283 | master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0') 284 | master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') 285 | master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') 286 | master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') 287 | master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') 288 | master_group.authorize('tcp', 7077, 7077, '0.0.0.0/0') 289 | if opts.ganglia: 290 | master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') 291 | if slave_group.rules == []: # Group was just now created 292 | slave_group.authorize(src_group=master_group) 293 | slave_group.authorize(src_group=slave_group) 294 | slave_group.authorize(src_group=sparknotebook_group) 295 | slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') 296 | slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') 297 | slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') 298 | slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') 299 | slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') 300 | slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') 301 | 302 | if not any(r for r in sparknotebook_group.rules for g in r.grants if master_group.id == g.group_id): 303 | sparknotebook_group.authorize(ip_protocol="tcp", from_port="1", to_port="65535", src_group=master_group) 304 | sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1", to_port="-1", src_group=master_group) 305 | 306 | if not any(r for r in sparknotebook_group.rules for g in r.grants if slave_group.id == g.group_id): 307 | sparknotebook_group.authorize(ip_protocol="tcp", from_port="1", to_port="65535", src_group=slave_group) 308 | sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1", to_port="-1", src_group=slave_group) 309 | 310 | # Check if instances are already running in our groups 311 | existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, 312 | die_on_error=False) 313 | if existing_slaves or (existing_masters and not opts.use_existing_master): 314 | print >> stderr,("ERROR: There are already instances running in " + 315 | "group %s or %s" % (master_group.name, slave_group.name)) 316 | sys.exit(1) 317 | 318 | # Figure out Spark AMI 319 | if opts.ami is None: 320 | opts.ami = get_spark_ami(opts) 321 | print "Launching instances..." 
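# opts.ami is now either the value passed with -a/--ami or the one resolved by
# get_spark_ami() from the mesos/spark-ec2 AMI list for this region and
# virtualization type; the lookup below exits if the image id cannot be found.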
322 | 323 | try: 324 | image = conn.get_all_images(image_ids=[opts.ami])[0] 325 | except: 326 | print >> stderr,"Could not find AMI " + opts.ami 327 | sys.exit(1) 328 | 329 | # Create block device mapping so that we can add an EBS volume if asked to 330 | block_map = BlockDeviceMapping() 331 | if opts.ebs_vol_size > 0: 332 | device = EBSBlockDeviceType() 333 | device.size = opts.ebs_vol_size 334 | device.delete_on_termination = True 335 | block_map["/dev/sdv"] = device 336 | 337 | # Launch slaves 338 | if opts.spot_price != None: 339 | zones = get_zones(conn, opts) 340 | 341 | num_zones = len(zones) 342 | i = 0 343 | my_req_ids = [] 344 | 345 | for zone in zones: 346 | best_price = find_best_price(conn,opts.instance_type,zone, opts.spot_price) 347 | # Launch spot instances with the requested price 348 | print >> stderr,("Requesting %d slaves as spot instances with price $%.3f/hour each (total $%.3f/hour)" % 349 | (opts.slaves, best_price, opts.slaves * best_price)) 350 | 351 | num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) 352 | interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(subnet_id=subnetId(), groups=[slave_group.id], associate_public_ip_address=True) 353 | interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface) 354 | 355 | slave_reqs = conn.request_spot_instances( 356 | price = best_price, 357 | image_id = opts.ami, 358 | launch_group = "launch-group-%s" % cluster_name, 359 | placement = zone, 360 | count = num_slaves_this_zone, 361 | key_name = opts.key_pair, 362 | instance_type = opts.instance_type, 363 | block_device_map = block_map, 364 | user_data = user_data, 365 | instance_profile_arn = opts.profile, 366 | network_interfaces = interfaces) 367 | my_req_ids += [req.id for req in slave_reqs] 368 | i += 1 369 | 370 | print >> stderr, "Waiting for spot instances to be granted" 371 | try: 372 | while True: 373 | time.sleep(10) 374 | reqs = conn.get_all_spot_instance_requests() 375 | id_to_req = {} 376 | for r in reqs: 377 | id_to_req[r.id] = r 378 | active_instance_ids = [] 379 | for i in my_req_ids: 380 | if i in id_to_req and id_to_req[i].state == "active": 381 | active_instance_ids.append(id_to_req[i].instance_id) 382 | if len(active_instance_ids) == opts.slaves: 383 | print >> stderr, "All %d slaves granted" % opts.slaves 384 | reservations = conn.get_all_instances(active_instance_ids) 385 | slave_nodes = [] 386 | for r in reservations: 387 | slave_nodes += r.instances 388 | break 389 | else: 390 | # print >> stderr, ".", 391 | print "%d of %d slaves granted, waiting longer" % ( 392 | len(active_instance_ids), opts.slaves) 393 | except: 394 | print >> stderr, "Canceling spot instance requests" 395 | conn.cancel_spot_instance_requests(my_req_ids) 396 | # Log a warning if any of these requests actually launched instances: 397 | (master_nodes, slave_nodes) = get_existing_cluster( 398 | conn, opts, cluster_name, die_on_error=False) 399 | running = len(master_nodes) + len(slave_nodes) 400 | if running: 401 | print >> stderr,("WARNING: %d instances are still running" % running) 402 | sys.exit(0) 403 | else: 404 | # Launch non-spot instances 405 | zones = get_zones(conn, opts) 406 | num_zones = len(zones) 407 | i = 0 408 | slave_nodes = [] 409 | for zone in zones: 410 | num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) 411 | if num_slaves_this_zone > 0: 412 | slave_res = image.run(key_name = opts.key_pair, 413 | security_group_ids = [slave_group.id], 414 | instance_type = opts.instance_type, 415 | subnet_id = 
subnetId(), 416 | placement = zone, 417 | min_count = num_slaves_this_zone, 418 | max_count = num_slaves_this_zone, 419 | block_device_map = block_map, 420 | user_data = user_data, 421 | instance_profile_arn = opts.profile) 422 | slave_nodes += slave_res.instances 423 | print >> stderr,"Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, 424 | zone, slave_res.id) 425 | i += 1 426 | 427 | # Launch or resume masters 428 | if existing_masters: 429 | print "Starting master..." 430 | for inst in existing_masters: 431 | if inst.state not in ["shutting-down", "terminated"]: 432 | inst.start() 433 | master_nodes = existing_masters 434 | else: 435 | master_type = opts.master_instance_type 436 | if master_type == "": 437 | master_type = opts.instance_type 438 | if opts.zone == 'all': 439 | opts.zone = random.choice(conn.get_all_zones()).name 440 | if opts.spot_price != None: 441 | best_price = find_best_price(conn,master_type,opts.zone,opts.spot_price) 442 | # Launch spot instances with the requested price 443 | print >> stderr, ("Requesting master as a spot instance with price $%.3f/hour" % (best_price)) 444 | 445 | interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(subnet_id=subnetId(), groups=[master_group.id], associate_public_ip_address=True) 446 | interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface) 447 | 448 | master_reqs = conn.request_spot_instances( 449 | price = best_price, 450 | image_id = opts.ami, 451 | launch_group = "launch-group-%s" % cluster_name, 452 | placement = opts.zone, 453 | count = 1, 454 | key_name = opts.key_pair, 455 | instance_type = master_type, 456 | block_device_map = block_map, 457 | user_data = user_data, 458 | instance_profile_arn = opts.profile, 459 | network_interfaces = interfaces) 460 | my_req_ids = [r.id for r in master_reqs] 461 | print >> stderr, "Waiting for spot instance to be granted" 462 | try: 463 | while True: 464 | time.sleep(10) 465 | reqs = conn.get_all_spot_instance_requests(request_ids=my_req_ids) 466 | id_to_req = {} 467 | for r in reqs: 468 | id_to_req[r.id] = r 469 | active_instance_ids = [] 470 | for i in my_req_ids: 471 | if i in id_to_req and id_to_req[i].state == "active": 472 | active_instance_ids.append(id_to_req[i].instance_id) 473 | if len(active_instance_ids) == 1: 474 | print >> stderr, "Master granted" 475 | reservations = conn.get_all_instances(active_instance_ids) 476 | master_nodes = [] 477 | for r in reservations: 478 | master_nodes += r.instances 479 | break 480 | else: 481 | # print >> stderr, ".", 482 | print "%d of %d masters granted, waiting longer" % ( 483 | len(active_instance_ids), 1) 484 | except: 485 | print >> stderr, "Canceling spot instance requests" 486 | conn.cancel_spot_instance_requests(my_req_ids) 487 | # Log a warning if any of these requests actually launched instances: 488 | (master_nodes, slave_nodes) = get_existing_cluster( 489 | conn, opts, cluster_name, die_on_error=False) 490 | running = len(master_nodes) + len(slave_nodes) 491 | if running: 492 | print >> stderr, ("WARNING: %d instances are still running" % running) 493 | sys.exit(0) 494 | else: 495 | master_res = image.run(key_name = opts.key_pair, 496 | security_group_ids = [master_group.id], 497 | instance_type = master_type, 498 | subnet_id = subnetId(), 499 | placement = opts.zone, 500 | min_count = 1, 501 | max_count = 1, 502 | block_device_map = block_map, 503 | user_data = user_data, 504 | instance_profile_arn = opts.profile) 505 | master_nodes = master_res.instances 506 | print >> 
stderr,"Launched master in %s, regid = %s" % (opts.zone, master_res.id) 507 | # Return all the instances 508 | return (master_nodes, slave_nodes) 509 | 510 | 511 | # Get the EC2 instances in an existing cluster if available. 512 | # Returns a tuple of lists of EC2 instance objects for the masters and slaves 513 | def get_existing_cluster(conn, opts, cluster_name, die_on_error=True): 514 | print >> stderr,"Searching for existing cluster %s ..." % cluster_name 515 | reservations = conn.get_all_instances() 516 | master_nodes = [] 517 | slave_nodes = [] 518 | for res in reservations: 519 | active = [i for i in res.instances if is_active(i)] 520 | for inst in active: 521 | group_names = [g.name for g in inst.groups] 522 | if (cluster_name + "-master") in group_names: 523 | master_nodes.append(inst) 524 | elif (cluster_name + "-slaves") in group_names: 525 | slave_nodes.append(inst) 526 | if master_nodes != []: 527 | print "Spark standalone cluster started at http://%s:8080" % master_nodes[0].public_dns_name 528 | print "Spark private ip address %s" % master_nodes[0].private_dns_name 529 | print >> stderr, "Spark standalone cluster started at http://%s:8080" % master_nodes[0].public_dns_name 530 | print >> stderr,("Found %d master(s), %d slaves" % 531 | (len(master_nodes), len(slave_nodes))) 532 | get_master_setup_files(master_nodes[0].public_dns_name, opts) 533 | if opts.ganglia: 534 | print >> stderr,"Ganglia started at http://%s:5080/ganglia" % master_nodes[0].public_dns_name 535 | if master_nodes != [] or not die_on_error: 536 | return (master_nodes, slave_nodes) 537 | else: 538 | if master_nodes == [] and slave_nodes != []: 539 | print "ERROR: Could not find master in group %s-master" % cluster_name 540 | else: 541 | print "ERROR: Could not find any existing cluster" 542 | sys.exit(1) 543 | 544 | 545 | # Deploy configuration files and run setup scripts on a newly launched 546 | # or started EC2 cluster. 547 | def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): 548 | 549 | master_nodes[0].update() 550 | master = master_nodes[0] 551 | print "Spark private ip address %s" % master.private_dns_name 552 | if deploy_ssh_key: 553 | print "Generating cluster's SSH key on master..." 554 | key_setup = """ 555 | [ -f ~/.ssh/id_rsa ] || 556 | (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa && 557 | cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys) 558 | """ 559 | ssh(master.private_dns_name, opts, key_setup) 560 | dot_ssh_tar = ssh_read(master.private_dns_name, opts, ['tar', 'c', '.ssh']) 561 | print >> stderr, "Transferring cluster's SSH key to slaves..." 562 | for slave in slave_nodes: 563 | slave.update() 564 | ssh_write(slave.public_dns_name, opts, ['tar', 'x'], dot_ssh_tar) 565 | 566 | modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs', 567 | 'mapreduce', 'spark-standalone'] 568 | 569 | if opts.hadoop_major_version == "1": 570 | modules = filter(lambda x: x != "mapreduce", modules) 571 | 572 | if opts.ganglia: 573 | modules.append('ganglia') 574 | 575 | # NOTE: We should clone the repository before running deploy_files to 576 | # prevent ec2-variables.sh from being overwritten 577 | ssh( 578 | host=master.private_dns_name, 579 | opts=opts, 580 | command="rm -rf spark-ec2" 581 | + " && " 582 | + "git clone https://github.com/paulomagalhaes/spark-ec2.git -b {b}".format(b=MESOS_SPARK_EC2_BRANCH) 583 | ) 584 | 585 | print >> stderr,"Deploying files to master... 
" 586 | (path, name) = os.path.split(__file__) 587 | deploy_files(conn, path+"/deploy.generic", opts, master_nodes, slave_nodes, modules) 588 | 589 | print >> stderr,"Running setup on master... " 590 | setup_spark_cluster(master, opts) 591 | get_master_setup_files(master.private_dns_name, opts) 592 | print >> stderr,"Done!" 593 | 594 | def get_master_setup_files(master, opts): 595 | scp(master, opts, "spark/lib/datanucleus*.jar", "%s/../lib" % SPARK_EC2_DIR) 596 | scp(master, opts, "spark/conf/*", SPARK_EC2_DIR) 597 | 598 | def setup_standalone_cluster(master, slave_nodes, opts): 599 | slave_ips = '\n'.join([i.public_dns_name for i in slave_nodes]) 600 | ssh(master, opts, "echo \"%s\" > spark/conf/slaves" % (slave_ips)) 601 | ssh(master, opts, "/root/spark/sbin/start-all.sh") 602 | 603 | def setup_spark_cluster(master, opts): 604 | ssh(master.private_dns_name, opts, "chmod u+x spark-ec2/setup.sh") 605 | ssh(master.private_dns_name, opts, "spark-ec2/setup.sh") 606 | print "Spark standalone cluster started at http://%s:8080" % master.public_dns_name 607 | print >> stderr, "Spark standalone cluster started at http://%s:8080" % master.public_dns_name 608 | if opts.ganglia: 609 | print >> stderr,"Ganglia started at http://%s:5080/ganglia" % master.public_dns_name 610 | 611 | 612 | 613 | # Wait for a whole cluster (masters, slaves and ZooKeeper) to start up 614 | def wait_for_cluster(conn, wait_secs, master_nodes, slave_nodes): 615 | print >> stderr,"Waiting for instances to start up..." 616 | time.sleep(5) 617 | wait_for_instances(conn, master_nodes) 618 | wait_for_instances(conn, slave_nodes) 619 | 620 | 621 | # Get number of local disks available for a given EC2 instance type. 622 | def get_num_disks(instance_type): 623 | # From http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html 624 | # Updated 2014-6-20 625 | disks_by_instance = { 626 | "m1.small": 1, 627 | "m1.medium": 1, 628 | "m1.large": 2, 629 | "m1.xlarge": 4, 630 | 631 | "c1.medium": 1, 632 | "c1.xlarge": 4, 633 | "m2.xlarge": 1, 634 | "m2.2xlarge": 1, 635 | "m2.4xlarge": 2, 636 | "cc1.4xlarge": 2, 637 | "cc2.8xlarge": 4, 638 | "cg1.4xlarge": 2, 639 | "hs1.8xlarge": 24, 640 | "cr1.8xlarge": 2, 641 | "hi1.4xlarge": 2, 642 | "m3.medium": 1, 643 | "m3.large": 1, 644 | "m3.xlarge": 2, 645 | "m3.2xlarge": 2, 646 | "i2.xlarge": 1, 647 | "i2.2xlarge": 2, 648 | "i2.4xlarge": 4, 649 | "i2.8xlarge": 8, 650 | "c3.large": 2, 651 | "c3.xlarge": 2, 652 | "c3.2xlarge": 2, 653 | "c3.4xlarge": 2, 654 | "c3.8xlarge": 2, 655 | "r3.large": 1, 656 | "r3.xlarge": 1, 657 | "r3.2xlarge": 1, 658 | "r3.4xlarge": 1, 659 | "r3.8xlarge": 2, 660 | "g2.2xlarge": 1, 661 | "t1.micro": 0 662 | } 663 | if instance_type in disks_by_instance: 664 | return disks_by_instance[instance_type] 665 | else: 666 | print >> stderr, ("WARNING: Don't know number of disks on instance type %s; assuming 1" 667 | % instance_type) 668 | return 1 669 | 670 | 671 | # Deploy the configuration file templates in a given local directory to 672 | # a cluster, filling in any template parameters with information about the 673 | # cluster (e.g. lists of masters and slaves). Files are only deployed to 674 | # the first master instance in the cluster, and we expect the setup 675 | # script to be run on that instance to copy them to other nodes.
676 | def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): 677 | active_master = master_nodes[0].public_dns_name 678 | 679 | num_disks = get_num_disks(opts.instance_type) 680 | hdfs_data_dirs = "/mnt/ephemeral-hdfs/data" 681 | mapred_local_dirs = "/mnt/hadoop/mrlocal" 682 | spark_local_dirs = "/mnt/spark" 683 | if num_disks > 1: 684 | for i in range(2, num_disks + 1): 685 | hdfs_data_dirs += ",/mnt%d/ephemeral-hdfs/data" % i 686 | mapred_local_dirs += ",/mnt%d/hadoop/mrlocal" % i 687 | spark_local_dirs += ",/mnt%d/spark" % i 688 | 689 | cluster_url = "%s:7077" % active_master 690 | 691 | if "." in opts.spark_version: 692 | # Pre-built spark & shark deploy 693 | (spark_v, shark_v) = get_spark_shark_version(opts) 694 | else: 695 | # Spark-only custom deploy 696 | spark_v = "%s|%s" % (opts.spark_git_repo, opts.spark_version) 697 | shark_v = "" 698 | modules = filter(lambda x: x != "shark", modules) 699 | 700 | template_vars = { 701 | "master_list": '\n'.join([i.public_dns_name for i in master_nodes]), 702 | "active_master": active_master, 703 | "slave_list": '\n'.join([i.public_dns_name for i in slave_nodes]), 704 | "cluster_url": cluster_url, 705 | "hdfs_data_dirs": hdfs_data_dirs, 706 | "mapred_local_dirs": mapred_local_dirs, 707 | "spark_local_dirs": spark_local_dirs, 708 | "swap": str(opts.swap), 709 | "modules": '\n'.join(modules), 710 | "spark_version": spark_v, 711 | "shark_version": shark_v, 712 | "hadoop_major_version": opts.hadoop_major_version, 713 | "metastore_user": "hive", 714 | "metastore_passwd": ''.join(random.SystemRandom().choice(string.uppercase + string.digits) for _ in xrange(10)), 715 | "spark_worker_instances": "%d" % opts.worker_instances, 716 | "spark_master_opts": opts.master_opts 717 | } 718 | 719 | # Create a temp directory in which we will place all the files to be 720 | # deployed after we substitute template parameters in them 721 | 722 | tmp_dir = tempfile.mkdtemp() 723 | for path, dirs, files in os.walk(root_dir): 724 | if path.find(".svn") == -1: 725 | dest_dir = os.path.join('/', path[len(root_dir):]) 726 | local_dir = tmp_dir + dest_dir 727 | if not os.path.exists(local_dir): 728 | os.makedirs(local_dir) 729 | for filename in files: 730 | if filename[0] not in '#.~' and filename[-1] != '~': 731 | dest_file = os.path.join(dest_dir, filename) 732 | local_file = tmp_dir + dest_file 733 | with open(os.path.join(path, filename)) as src: 734 | with open(local_file, "w") as dest: 735 | text = src.read() 736 | for key in template_vars: 737 | text = text.replace("{{" + key + "}}", template_vars[key]) 738 | dest.write(text) 739 | dest.close() 740 | # rsync the whole directory over to the master machine 741 | command = [ 742 | 'rsync', '-rv', 743 | '-e', stringify_command(ssh_command(opts)), 744 | "%s/" % tmp_dir, 745 | "%s@%s:/" % (opts.user, active_master) 746 | ] 747 | subprocess.check_call(command) 748 | # Remove the temp directory we created above 749 | shutil.rmtree(tmp_dir) 750 | 751 | 752 | 753 | def stringify_command(parts): 754 | if isinstance(parts, str): 755 | return parts 756 | else: 757 | return ' '.join(map(pipes.quote, parts)) 758 | 759 | 760 | def ssh_args(opts): 761 | parts = ['-o', 'StrictHostKeyChecking=no', '-o', 'LogLevel=error'] 762 | # parts += ['-i', '~/.ssh/id_rsa'] 763 | return parts 764 | 765 | 766 | def ssh_command(opts): 767 | return ['ssh'] + ssh_args(opts) 768 | 769 | def scp_command(opts): 770 | return ['scp'] + ssh_args(opts) 771 | 772 | def pub_key(): 773 | key_gen = """[ -f 
~/.ssh/id_rsa ] || 774 | (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa) 775 | """ 776 | subprocess.check_call(key_gen, shell=True) 777 | return subprocess.Popen("cat ~/.ssh/id_rsa.pub", shell=True, stdout=subprocess.PIPE).communicate()[0] 778 | 779 | def profile(): 780 | return subprocess.Popen("""curl -s http://169.254.169.254/latest/meta-data/iam/info | grep InstanceProfileArn""", shell=True, stdout=subprocess.PIPE).communicate()[0].split("\"")[3] 781 | 782 | def region(): 783 | return subprocess.Popen("""curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | grep region""", shell=True, stdout=subprocess.PIPE).communicate()[0].split("\"")[3] 784 | 785 | def zone(): 786 | return subprocess.Popen("""curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | grep availabilityZone""", shell=True, stdout=subprocess.PIPE).communicate()[0].split("\"")[3] 787 | 788 | def subnetId(): 789 | mac = subprocess.Popen("""curl -s http://169.254.169.254/latest/meta-data/network/interfaces/macs/ | grep /""", shell=True, stdout=subprocess.PIPE).communicate()[0].split("/")[0] 790 | return subprocess.Popen("""curl -s http://169.254.169.254/latest/meta-data/network/interfaces/macs/""" + mac + """/subnet-id/""", shell=True, stdout=subprocess.PIPE).communicate()[0] 791 | 792 | def keypair(): 793 | return subprocess.Popen("""curl -s http://169.254.169.254/latest/meta-data/public-keys/0/openssh-key""", shell=True, stdout=subprocess.PIPE).communicate()[0].split(" ")[2].strip() 794 | 795 | # Run a command on a host through ssh, retrying up to ten times 796 | # and then throwing an exception if ssh continues to fail. 797 | def ssh(host, opts, command): 798 | tries = 0 799 | while True: 800 | try: 801 | #print >> stderr, ssh_command(opts) + ['-t', '-t', '%s@%s' % (opts.user, host), stringify_command(command)] 802 | return subprocess.check_call( 803 | ssh_command(opts) + ['-t', '-t', '%s@%s' % (opts.user, host), stringify_command(command)]) 804 | except subprocess.CalledProcessError as e: 805 | if (tries > 10): 806 | print >> stderr,'Failed to SSH to remote host %s after %s retries.' % (host, tries) 807 | # If this was an ssh failure, provide the user with hints. 808 | if e.returncode == 255: 809 | raise UsageError('Failed to SSH to remote host %s.\nPlease check that you have provided the correct --identity-file and --key-pair parameters and try again.' % (host)) 810 | else: 811 | raise e 812 | #print >> stderr,"Error executing remote command, retrying after 30 seconds: {0}".format(e) 813 | time.sleep(30) 814 | tries = tries + 1 815 | 816 | def scp(host, opts, src, target): 817 | tries = 0 818 | while True: 819 | try: 820 | return subprocess.check_call( 821 | scp_command(opts) + ['%s@%s:%s' % (opts.user, host, src), target]) 822 | except subprocess.CalledProcessError as e: 823 | if (tries > 10): 824 | print >> stderr,"Failed to SCP to remote host {0} after {1} retries.".format(host, tries) 825 | # If this was an ssh failure, provide the user with hints. 
826 | if e.returncode == 255: 827 | raise UsageError("Failed to SCP to remote host {0}.\nPlease check that you have provided the correct --identity-file and --key-pair parameters and try again.".format(host)) 828 | else: 829 | raise e 830 | time.sleep(30) 831 | tries = tries + 1 832 | 833 | 834 | # Backported from Python 2.7 for compatibility with 2.6 (See SPARK-1990) 835 | def _check_output(*popenargs, **kwargs): 836 | if 'stdout' in kwargs: 837 | raise ValueError('stdout argument not allowed, it will be overridden.') 838 | process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) 839 | output, unused_err = process.communicate() 840 | retcode = process.poll() 841 | if retcode: 842 | cmd = kwargs.get("args") 843 | if cmd is None: 844 | cmd = popenargs[0] 845 | raise subprocess.CalledProcessError(retcode, cmd, output=output) 846 | return output 847 | 848 | 849 | def ssh_read(host, opts, command): 850 | return _check_output( 851 | ssh_command(opts) + ['%s@%s' % (opts.user, host), stringify_command(command)]) 852 | 853 | 854 | def ssh_write(host, opts, command, input): 855 | tries = 0 856 | while True: 857 | proc = subprocess.Popen( 858 | ssh_command(opts) + ['%s@%s' % (opts.user, host), stringify_command(command)], 859 | stdin=subprocess.PIPE, stderr=subprocess.STDOUT) 860 | proc.stdin.write(input) 861 | proc.stdin.close() 862 | status = proc.wait() 863 | if status == 0: 864 | break 865 | elif (tries > 5): 866 | raise RuntimeError("ssh_write failed with error %s" % proc.returncode) 867 | else: 868 | print >> stderr, "Error {0} while executing remote command, retrying after 30 seconds".format(status) 869 | time.sleep(30) 870 | tries = tries + 1 871 | 872 | 873 | # Gets a list of zones to launch instances in 874 | def get_zones(conn, opts): 875 | if opts.zone == 'all': 876 | zones = [z.name for z in conn.get_all_zones()] 877 | else: 878 | zones = [opts.zone] 879 | return zones 880 | 881 | 882 | # Gets the number of items in a partition 883 | def get_partition(total, num_partitions, current_partitions): 884 | num_slaves_this_zone = total / num_partitions 885 | if (total % num_partitions) - current_partitions > 0: 886 | num_slaves_this_zone += 1 887 | return num_slaves_this_zone 888 | 889 | 890 | def real_main(): 891 | (opts, action, cluster_name) = parse_args() 892 | try: 893 | conn = ec2.connect_to_region(opts.region) 894 | except Exception as e: 895 | print >> stderr,(e) 896 | sys.exit(1) 897 | 898 | # Select an AZ at random if it was not specified. 899 | if opts.zone == "": 900 | opts.zone = random.choice(conn.get_all_zones()).name 901 | 902 | if action == "launch": 903 | if opts.slaves <= 0: 904 | print >> sys.stderr, "ERROR: You have to start at least 1 slave" 905 | sys.exit(1) 906 | if opts.resume: 907 | (master_nodes, slave_nodes) = get_existing_cluster( 908 | conn, opts, cluster_name) 909 | else: 910 | start_secs = time.time() 911 | (master_nodes, slave_nodes) = launch_cluster( 912 | conn, opts, cluster_name) 913 | wait_for_cluster(conn, opts.wait, master_nodes, slave_nodes) 914 | print >> stderr, "Provisioning took %.3f minutes" % ((time.time() - start_secs) / 60.0) 915 | start_secs = time.time() 916 | setup_cluster(conn, master_nodes, slave_nodes, opts, True) 917 | print >> stderr,"Setup took %.3f minutes" % ((time.time() - start_secs)/60.0) 918 | 919 | elif action == "destroy": 920 | (master_nodes, slave_nodes) = get_existing_cluster( 921 | conn, opts, cluster_name, die_on_error=False) 922 | print >> stderr,"Terminating master..." 
923 | for inst in master_nodes: 924 | inst.terminate() 925 | print >> stderr,"Terminating slaves..." 926 | for inst in slave_nodes: 927 | inst.terminate() 928 | 929 | # Delete security groups as well 930 | if opts.delete_groups: 931 | print >> stderr,"Deleting security groups (this will take some time)..." 932 | group_names = [cluster_name + "-master", cluster_name + "-slaves"] 933 | 934 | attempt = 1; 935 | while attempt <= 3: 936 | print >> stderr,"Attempt %d" % attempt 937 | groups = [g for g in conn.get_all_security_groups() if g.name in group_names] 938 | success = True 939 | # Delete individual rules in all groups before deleting groups to 940 | # remove dependencies between them 941 | for group in groups: 942 | print >> stderr,"Deleting rules in security group " + group.name 943 | for rule in group.rules: 944 | for grant in rule.grants: 945 | success &= group.revoke(ip_protocol=rule.ip_protocol, 946 | from_port=rule.from_port, 947 | to_port=rule.to_port, 948 | src_group=grant) 949 | 950 | # Sleep for AWS eventual-consistency to catch up, and for instances 951 | # to terminate 952 | time.sleep(30) # Yes, it does have to be this long :-( 953 | for group in groups: 954 | try: 955 | conn.delete_security_group(group.name) 956 | print >> stderr,"Deleted security group " + group.name 957 | except boto.exception.EC2ResponseError: 958 | success = False; 959 | print >> stderr,"Failed to delete security group " + group.name 960 | 961 | # Unfortunately, group.revoke() returns True even if a rule was not 962 | # deleted, so this needs to be rerun if something fails 963 | if success: break; 964 | 965 | attempt += 1 966 | 967 | if not success: 968 | print >> stderr,"Failed to delete all security groups after 3 tries." 969 | print >> stderr,"Try re-running in a few minutes." 970 | 971 | elif action == "login": 972 | (master_nodes, slave_nodes) = get_existing_cluster( 973 | conn, opts, cluster_name) 974 | master = master_nodes[0].public_dns_name 975 | print "Logging into master " + master + "..." 976 | proxy_opt = [] 977 | if opts.proxy_port != None: 978 | proxy_opt = ['-D', opts.proxy_port] 979 | subprocess.check_call( 980 | ssh_command(opts) + proxy_opt + ['-t', '-t', "%s@%s" % (opts.user, master)]) 981 | 982 | elif action == "get-master": 983 | (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) 984 | print master_nodes[0].public_dns_name 985 | 986 | elif action == "stop": 987 | response = raw_input("Are you sure you want to stop the cluster " + 988 | cluster_name + "?\nDATA ON EPHEMERAL DISKS WILL BE LOST, " + 989 | "BUT THE CLUSTER WILL KEEP USING SPACE ON\n" + 990 | "AMAZON EBS IF IT IS EBS-BACKED!!\n" + 991 | "All data on spot-instance slaves will be lost.\n" + 992 | "Stop cluster " + cluster_name + " (y/N): ") 993 | if response == "y": 994 | (master_nodes, slave_nodes) = get_existing_cluster( 995 | conn, opts, cluster_name, die_on_error=False) 996 | print >> stderr,"Stopping master..." 997 | for inst in master_nodes: 998 | if inst.state not in ["shutting-down", "terminated"]: 999 | inst.stop() 1000 | print >> stderr,"Stopping slaves..." 1001 | for inst in slave_nodes: 1002 | if inst.state not in ["shutting-down", "terminated"]: 1003 | if inst.spot_instance_request_id: 1004 | inst.terminate() 1005 | else: 1006 | inst.stop() 1007 | 1008 | elif action == "start": 1009 | (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) 1010 | print >> stderr,"Starting slaves..." 
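# For the "start" action below: the previously stopped instances are booted again, wait_for_cluster blocks until they are reachable, and setup_cluster is rerun with deploy_ssh_key=False so the SSH key generation/distribution step is skipped this time around.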
1011 | for inst in slave_nodes: 1012 | if inst.state not in ["shutting-down", "terminated"]: 1013 | inst.start() 1014 | print >> stderr,"Starting master..." 1015 | for inst in master_nodes: 1016 | if inst.state not in ["shutting-down", "terminated"]: 1017 | inst.start() 1018 | wait_for_cluster(conn, opts.wait, master_nodes, slave_nodes) 1019 | setup_cluster(conn, master_nodes, slave_nodes, opts, False) 1020 | 1021 | else: 1022 | print >> stderr,"Invalid action: %s" % action 1023 | sys.exit(1) 1024 | 1025 | def find_best_price(conn, instance, zone, factor): 1026 | last_hour_zone = get_spot_price(conn, zone, datetime.utcnow() - timedelta(hours=1), instance) 1027 | average_price_last_hour = sum(i.price for i in last_hour_zone) / float(len(last_hour_zone)) 1028 | return average_price_last_hour * factor 1029 | 1030 | def get_spot_price(conn, zone, start_date_hour, instance): 1031 | return conn.get_spot_price_history(start_time=start_date_hour.strftime("%Y-%m-%dT%H:%M:%SZ"), end_time=datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"), instance_type=instance, product_description="Linux/UNIX", availability_zone=zone) 1032 | 1033 | def main(): 1034 | try: 1035 | real_main() 1036 | except UsageError as e: 1037 | print >> stderr,"\nError:\n", e 1038 | sys.exit(1) 1039 | 1040 | 1041 | if __name__ == "__main__": 1042 | logging.basicConfig() 1043 | main() 1044 | --------------------------------------------------------------------------------