├── ansible ├── .gitignore ├── nginx.sh ├── inventory │ ├── local │ ├── ec2.ini │ └── hosts ├── spark-jars.sh ├── sparknotebook.yml ├── sparknotebook-prov.yml ├── sparknotebook.sh ├── sparknotebook └── nginx.conf ├── .travis.yml ├── sbt └── sbt │ ├── bin │ ├── sbt-launch.jar │ ├── sbt.bat │ ├── sbt │ └── sbt-launch-lib.bash │ └── conf │ ├── sbtconfig.txt │ └── sbtopts ├── src ├── test │ ├── resources │ │ ├── FuncTestSparkNotebookContextFile2.csv │ │ ├── FuncTestSparkNotebookContextEmpty.csv │ │ └── FuncTestSparkNotebookContextFile1.csv │ └── scala │ │ └── eleflow │ │ └── sparknotebook │ │ ├── BeforeAndAfterWithContext.scala │ │ └── FuncTestSparkNotebookContext.scala ├── main │ ├── resources │ │ └── log4j.properties │ └── scala │ │ └── eleflow │ │ └── sparknotebook │ │ ├── exception │ │ ├── UnexpectedValueException.scala │ │ ├── InvalidDataException.scala │ │ └── UnexpectedFileFormatException.scala │ │ ├── enums │ │ ├── DataSetType.scala │ │ ├── PeriodOfDay.scala │ │ └── DateSplitType.scala │ │ ├── util │ │ ├── SparkNotebookConfig.scala │ │ ├── IntStringImplicitTypeConverter.scala │ │ └── DateTimeParser.scala │ │ ├── Main.scala │ │ ├── visualization │ │ └── RichDisplay.scala │ │ ├── SparkNotebookInterpreter.scala │ │ ├── data │ │ ├── DataTransformer.scala │ │ └── Dataset.scala │ │ └── SparkNotebookContext.scala ├── universal │ └── ec2 │ │ ├── deploy.generic │ │ └── root │ │ │ └── spark-ec2 │ │ │ └── ec2-variables.sh │ │ └── spark_ec2.py └── templates │ └── bash-template ├── .gitignore ├── aws.deploy.Dockerfile ├── nodocker.md ├── project ├── build.properties └── plugins.sbt ├── aws.deploy.sh ├── README.md └── LICENSE /ansible/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | logs 3 | -------------------------------------------------------------------------------- /ansible/nginx.sh: -------------------------------------------------------------------------------- 1 | sudo yum -y install nginx -------------------------------------------------------------------------------- /ansible/inventory/local: -------------------------------------------------------------------------------- 1 | [local] 2 | localhost 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | 3 | scala: 4 | - 2.10.4 5 | 6 | script: "sbt clean scoverage:test" -------------------------------------------------------------------------------- /sbt/sbt/bin/sbt-launch.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eleflow/sparknotebook/HEAD/sbt/sbt/bin/sbt-launch.jar -------------------------------------------------------------------------------- /src/test/resources/FuncTestSparkNotebookContextFile2.csv: -------------------------------------------------------------------------------- 1 | int,string2,double 2 | 5,vlr1,10.5 3 | 1,vl3,0.1 4 | 8,vlr1,10.0 -------------------------------------------------------------------------------- /src/test/resources/FuncTestSparkNotebookContextEmpty.csv: -------------------------------------------------------------------------------- 1 | id,int,string2,double 2 | 1,5,vlr1,10.5 3 | 2,1,vl3,0.1 4 | 3,8,,10.0 -------------------------------------------------------------------------------- /src/test/resources/FuncTestSparkNotebookContextFile1.csv: -------------------------------------------------------------------------------- 1 | 
id,int,string2,double 2 | 1,5,vlr1,10.5 3 | 2,1,vl3,0.1 4 | 3,8,vlr1,10.0 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | metastore_db 2 | derby.log 3 | target 4 | project/project/target 5 | project/target 6 | aws.deploy.env 7 | logs/ 8 | *.pem 9 | -------------------------------------------------------------------------------- /sbt/sbt/conf/sbtconfig.txt: -------------------------------------------------------------------------------- 1 | # Set the java args to high 2 | 3 | -Xmx512M 4 | 5 | -XX:MaxPermSize=256m 6 | 7 | -XX:ReservedCodeCacheSize=128m 8 | 9 | 10 | 11 | # Set the extra SBT options 12 | 13 | -Dsbt.log.format=true 14 | 15 | -------------------------------------------------------------------------------- /ansible/spark-jars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget -O /tmp/spark-1.2.0-bin-cdh4.tgz https://s3-us-west-2.amazonaws.com/sparknotebook-public/spark/spark-1.2.0-bin-cdh4.tgz 3 | tar -xzf /tmp/spark-1.2.0-bin-cdh4.tgz --strip-components 2 --wildcards --no-anchored 'spark-assembly*.jar' 4 | tar -xzf /tmp/spark-1.2.0-bin-cdh4.tgz --strip-components 2 --wildcards --no-anchored 'datanucleus*.jar' 5 | mkdir -p /opt/spark 6 | mkdir -p /opt/spark/lib 7 | cp spark-assembly*.jar /opt/spark/lib 8 | cp datanucleus*.jar /opt/spark/lib 9 | -------------------------------------------------------------------------------- /aws.deploy.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | MAINTAINER Paulo Magalhaes 3 | 4 | RUN apt-get update && apt-get install -y \ 5 | python \ 6 | python-pip \ 7 | software-properties-common \ 8 | && apt-add-repository ppa:ansible/ansible \ 9 | && apt-get update && apt-get install -y ansible 10 | 11 | RUN pip install awscli 12 | RUN pip install boto 13 | ADD aws.deploy.env /tmp/aws.deploy.env 14 | RUN . /tmp/aws.deploy.env && printf "[defaults]\nprivate_key_file=/.ssh/${AWS_KEY_PAIR}.pem\nhost_key_checking=False" > ~/.ansible.cfg 15 | 16 | ENTRYPOINT /sparknotebook/aws.deploy.sh 17 | -------------------------------------------------------------------------------- /nodocker.md: -------------------------------------------------------------------------------- 1 | # Setup without Docker 2 | 3 | If you want to use Sparknotebook without docker you have to: 4 | 1. [To install ansible](http://docs.ansible.com/intro_installation.html#installing-the-control-machine) 5 | 1. [To install boto](http://boto.readthedocs.org/en/latest/getting_started.html#installing-boto) 6 | 1. [To configure aws credentials in boto](http://boto.readthedocs.org/en/latest/getting_started.html#configuring-boto-credentials) 7 | 1. 
[Create a AWS IAM role](http://docs.aws.amazon.com/IAM/latest/UserGuide/roles-creatingrole-service.html) named **dev-ops** 8 | with the policies below: 9 | 10 | ```JSON 11 | { 12 | "Version": "2012-10-17", 13 | "Statement": [ 14 | { 15 | "Action": "ec2:*", 16 | "Effect": "Allow", 17 | "Resource": "*" 18 | } 19 | ] 20 | } 21 | ``` 22 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootLogger=INFO, file 3 | 4 | # Direct log messages to a log file 5 | log4j.appender.file=org.apache.log4j.RollingFileAppender 6 | 7 | #Redirect to Tomcat logs folder 8 | #log4j.appender.file.File=${catalina.home}/logs/logging.log 9 | 10 | log4j.appender.file.File=logs/logging.log 11 | log4j.appender.file.MaxFileSize=10MB 12 | log4j.appender.file.MaxBackupIndex=10 13 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 14 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 15 | 16 | # Settings to quiet third party logs that are too verbose 17 | log4j.logger.org.eclipse.jetty=INFO 18 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=INFO 19 | log4j.logger.org.apache.spark.repl=INFO 20 | 21 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one or more 2 | // contributor license agreements. See the NOTICE file distributed with 3 | // this work for additional information regarding copyright ownership. 4 | // The ASF licenses this file to You under the Apache License, Version 2.0 5 | // (the "License"); you may not use this file except in compliance with 6 | // the License. You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | sbt.version=0.13.7 17 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/exception/UnexpectedValueException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.exception 17 | 18 | /** 19 | * Created by dirceu on 16/12/14. 
20 | */ 21 | class UnexpectedValueException(message:String) extends Exception(message) 22 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/enums/DataSetType.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.enums 17 | 18 | /** 19 | * Created by dirceu on 12/12/14. 20 | */ 21 | object DataSetType extends Enumeration{ 22 | type Types = Value 23 | val Train,Test = Value 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/enums/PeriodOfDay.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.enums 17 | 18 | /** 19 | * Created by dirceu on 20/02/15. 20 | */ 21 | object PeriodOfDay extends Enumeration { 22 | type PeriodOfDay = Value 23 | val Morning, Afternoon, Evening, Dawn = Value 24 | } -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/exception/InvalidDataException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.exception 17 | 18 | import scala.util.control.NoStackTrace 19 | 20 | /** 21 | * Created by dirceu on 04/11/14. 
22 | */ 23 | class InvalidDataException (message: String) extends Exception(message) with NoStackTrace { 24 | def this() = this("") 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/exception/UnexpectedFileFormatException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.exception 17 | 18 | import scala.util.control.NoStackTrace 19 | 20 | /** 21 | * Created by dirceu on 15/10/14. 22 | */ 23 | class UnexpectedFileFormatException(message: String) extends Exception(message) with NoStackTrace { 24 | def this() = this("") 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/util/SparkNotebookConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.util 17 | /** 18 | * Created by dirceu on 05/12/14. 19 | */ 20 | object SparkNotebookConfig { 21 | val dateFormatFileName = "spark.properties" 22 | val propertyFolder = s"sparknotebook-${DateTimeParser.hashCode()}" 23 | val tempFolder = System.getProperty("java.io.tmpdir") 24 | } -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/enums/DateSplitType.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.enums 17 | 18 | /** 19 | * Created by dirceu on 02/12/14. 
20 | */ 21 | object DateSplitType { 22 | def contains(compare:Long,compareTo:Long) = (compare & compareTo) == compareTo 23 | val NoSplit = 1 << 0 24 | val Period = 1 << 1 25 | val DayOfAWeek = 1 << 2 26 | val WorkNonWorkingDay = 1 << 3 27 | val PeriodDayOfAWeek = Period | DayOfAWeek 28 | } 29 | -------------------------------------------------------------------------------- /sbt/sbt/conf/sbtopts: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------ # 2 | # The SBT Configuration file. # 3 | # ------------------------------------------------ # 4 | 5 | 6 | # Disable ANSI color codes 7 | # 8 | #-no-colors 9 | 10 | # Starts sbt even if the current directory contains no sbt project. 11 | # 12 | -sbt-create 13 | 14 | # Path to global settings/plugins directory (default: ~/.sbt) 15 | # 16 | #-sbt-dir /etc/sbt 17 | 18 | # Path to shared boot directory (default: ~/.sbt/boot in 0.11 series) 19 | # 20 | #-sbt-boot ~/.sbt/boot 21 | 22 | # Path to local Ivy repository (default: ~/.ivy2) 23 | # 24 | #-ivy ~/.ivy2 25 | 26 | # set memory options 27 | # 28 | #-mem 29 | 30 | # Use local caches for projects, no sharing. 31 | # 32 | #-no-share 33 | 34 | # Put SBT in offline mode. 35 | # 36 | #-offline 37 | 38 | # Sets the SBT version to use. 39 | #-sbt-version 0.11.3 40 | 41 | # Scala version (default: latest release) 42 | # 43 | #-scala-home 44 | #-scala-version 45 | 46 | # java version (default: java from PATH, currently $(java -version |& grep version)) 47 | # 48 | #-java-home 49 | 50 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/util/IntStringImplicitTypeConverter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.util 17 | 18 | import com.gensler.scalavro.util.Union 19 | import com.gensler.scalavro.util.Union.union 20 | /** 21 | * Created by dirceu on 03/11/14. 22 | */ 23 | object IntStringImplicitTypeConverter { 24 | 25 | type IS = union[Int]#or[String] 26 | implicit def convIntToUnion(i:Int): Union[IS] = { 27 | val union = new Union[IS] 28 | union.assign(i) 29 | union 30 | } 31 | implicit def convStringToUnion(i:String): Union[IS] = { 32 | val union = new Union[IS] 33 | union.assign(i) 34 | union 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one or more 2 | // contributor license agreements. See the NOTICE file distributed with 3 | // this work for additional information regarding copyright ownership. 
4 | // The ASF licenses this file to You under the Apache License, Version 2.0 5 | // (the "License"); you may not use this file except in compliance with 6 | // the License. You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | resolvers ++= Seq( 17 | "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", 18 | "Sonatype OSS Snapshots Repository" at "http://oss.sonatype.org/content/groups/public" 19 | ) 20 | 21 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "0.7.4") 22 | 23 | addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "0.6.2") 24 | 25 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.4") 26 | 27 | addSbtPlugin("org.scoverage" %% "sbt-scoverage" % "0.99.7.1") -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/Main.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package eleflow.sparknotebook 17 | 18 | import sun.misc.{Signal,SignalHandler} 19 | 20 | import org.zeromq.ZMQ 21 | 22 | import scalax.io.JavaConverters._ 23 | import scalax.file.Path 24 | 25 | import org.refptr.iscala._ 26 | import json.JsonUtil._ 27 | import msg._ 28 | 29 | object Main extends App { 30 | val options = new Options(args) 31 | 32 | val thread = new Thread { 33 | override def run() { 34 | val iscala = new IScala(options.config){ 35 | override lazy val interpreter = new SparkNotebookInterpreter(classpath, options.config.args) 36 | } 37 | iscala.heartBeat.join() 38 | } 39 | } 40 | 41 | thread.setName("IScala") 42 | thread.setDaemon(true) 43 | thread.start() 44 | thread.join() 45 | } 46 | 47 | 48 | -------------------------------------------------------------------------------- /ansible/sparknotebook.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: tag_Name_sparknotebook 3 | gather_facts: True 4 | user: ec2-user 5 | sudo: True 6 | tasks: 7 | - name: install spark jars 8 | script: spark-jars.sh 9 | - debug: var=script.stdout 10 | - debug: var=script.stderr 11 | - name: copy sparknotebook to init.d 12 | copy: src=sparknotebook dest=/etc/init.d/sparknotebook 13 | - name: copy local sparknotebook 14 | copy: src=../target/universal/sparknotebook-0.1.0-SNAPSHOT.zip dest=/tmp/sparknotebook-0.1.0-SNAPTSHOT.zip 15 | ignore_errors: yes 16 | - name: ipython notebook and sparknotebook 17 | script: sparknotebook.sh 18 | - debug: var=script.stdout 19 | - debug: var=script.stderr 20 | - name: start sparknotebook 21 | service: name=sparknotebook state=restarted enabled=yes 22 | - debug: var=script.stdout 23 | - debug: var=script.stderr 24 | - name: install nginx 25 | script: nginx.sh 26 | tags: 27 | - nginx 28 | - debug: var=script.stdout 29 | - debug: var=script.stderr 30 | - name: copy nginx conf 31 | copy: src=nginx.conf dest=/etc/nginx/nginx.conf 32 | tags: 33 | - nginx 34 | - debug: var=script.stdout 35 | - debug: var=script.stderr 36 | - name: nginx on startup 37 | service: name=nginx state=started enabled=on 38 | tags: 39 | - nginx 40 | - nginx-conf 41 | - debug: var=script.stdout 42 | - debug: var=script.stderr 43 | 44 | -------------------------------------------------------------------------------- /sbt/sbt/bin/sbt.bat: -------------------------------------------------------------------------------- 1 | @REM SBT launcher script 2 | @REM 3 | @REM Envioronment: 4 | @REM JAVA_HOME - location of a JDK home dir (mandatory) 5 | @REM SBT_OPTS - JVM options (optional) 6 | @REM Configuration: 7 | @REM sbtconfig.txt found in the SBT_HOME. 8 | 9 | @REM ZOMG! We need delayed expansion to build up CFG_OPTS later 10 | @setlocal enabledelayedexpansion 11 | 12 | @echo off 13 | set SBT_HOME=%~dp0 14 | 15 | rem FIRST we load the config file of extra options. 16 | set FN=%SBT_HOME%\..\conf\sbtconfig.txt 17 | set CFG_OPTS= 18 | FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%FN%") DO ( 19 | set DO_NOT_REUSE_ME=%%i 20 | rem ZOMG (Part #2) WE use !! here to delay the expansion of 21 | rem CFG_OPTS, otherwise it remains "" for this loop. 22 | set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME! 
23 | ) 24 | 25 | rem We use the value of the JAVACMD environment variable if defined 26 | set _JAVACMD=%JAVACMD% 27 | 28 | if "%_JAVACMD%"=="" ( 29 | if not "%JAVA_HOME%"=="" ( 30 | if exist "%JAVA_HOME%\bin\java.exe" set "_JAVACMD=%JAVA_HOME%\bin\java.exe" 31 | ) 32 | ) 33 | 34 | if "%_JAVACMD%"=="" set _JAVACMD=java 35 | 36 | rem We use the value of the JAVA_OPTS environment variable if defined, rather than the config. 37 | set _JAVA_OPTS=%JAVA_OPTS% 38 | if "%_JAVA_OPTS%"=="" set _JAVA_OPTS=%CFG_OPTS% 39 | 40 | :run 41 | 42 | "%_JAVACMD%" %_JAVA_OPTS% %SBT_OPTS% -cp "%SBT_HOME%sbt-launch.jar" xsbt.boot.Boot %* 43 | if ERRORLEVEL 1 goto error 44 | goto end 45 | 46 | :error 47 | @endlocal 48 | exit /B 1 49 | 50 | 51 | :end 52 | @endlocal 53 | exit /B 0 54 | -------------------------------------------------------------------------------- /src/universal/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # These variables are automatically filled in by the spark-ec2 script. 
21 | export MASTERS="{{master_list}}" 22 | export SLAVES="{{slave_list}}" 23 | export HDFS_DATA_DIRS="{{hdfs_data_dirs}}" 24 | export MAPRED_LOCAL_DIRS="{{mapred_local_dirs}}" 25 | export SPARK_LOCAL_DIRS="{{spark_local_dirs}}" 26 | export MODULES="{{modules}}" 27 | export SPARK_VERSION="{{spark_version}}" 28 | export SHARK_VERSION="{{shark_version}}" 29 | export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}" 30 | export SWAP_MB="{{swap}}" 31 | export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}" 32 | export SPARK_MASTER_OPTS="{{spark_master_opts}}" 33 | export METASTORE_USER="{{metastore_user}}" 34 | export METASTORE_PASSWD="{{metastore_passwd}}" -------------------------------------------------------------------------------- /aws.deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | create-profile(){ 3 | h=`aws iam list-roles|grep RoleName |grep dev-ops |cut -d '"' -f 4` 4 | if [ -z $h ]; then 5 | echo "creating dev-ops role" 6 | aws iam create-instance-profile --instance-profile-name dev-ops 7 | aws iam create-role --role-name dev-ops --assume-role-policy-document '{ "Version": "2012-10-17", "Statement": [ { "Action": "sts:AssumeRole", "Principal": { "Service": "ec2.amazonaws.com" }, "Effect": "Allow", "Sid": "" } ] }' 8 | aws iam add-role-to-instance-profile --instance-profile-name dev-ops --role-name dev-ops 9 | aws iam put-role-policy --role-name dev-ops --policy-name 'AllowEc2forDevOps' --policy-document '{ "Version": "2012-10-17", "Statement": [ { "Action": "ec2:*", "Effect": "Allow", "Resource": "*" } ] }' 10 | aws iam put-role-policy --role-name dev-ops --policy-name 'AllowS3forDevOps' --policy-document '{ "Version": "2012-10-17", "Statement": [ { "Action": "s3:*", "Effect": "Allow", "Resource": "*" } ] }' 11 | aws iam put-role-policy --role-name dev-ops --policy-name 'AllowPassRoleforDevOps' --policy-document '{ "Version": "2012-10-17", "Statement": [ { "Sid": "Stmt1409776891000", "Effect": "Allow", "Action": [ "iam:PassRole" ], "Resource": [ "*" ] } ] }' 12 | else 13 | echo "dev-ops role already exits" 14 | fi 15 | } 16 | create-profile 17 | cd /sparknotebook/ansible 18 | ansible-playbook -vvvv -i inventory/local --extra-vars "keypair=$AWS_KEY_PAIR" sparknotebook-prov.yml 19 | ansible-playbook -vvvv -i inventory/hosts --extra-vars "keypair=$AWS_KEY_PAIR" sparknotebook.yml -------------------------------------------------------------------------------- /src/test/scala/eleflow/sparknotebook/BeforeAndAfterWithContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package eleflow.sparknotebook 17 | 18 | import org.apache.log4j.{Level, Logger} 19 | import org.apache.spark.SparkConf 20 | import org.scalatest.{BeforeAndAfterEach, Suite} 21 | 22 | object TestSparkConf { 23 | @transient lazy val conf = { 24 | val sconf = new SparkConf() 25 | sconf.set("spark.app.name", "teste") 26 | sconf 27 | } 28 | 29 | val separator ="," 30 | 31 | } 32 | 33 | /** 34 | * Created by dirceu on 22/10/14. 35 | */ 36 | trait BeforeAndAfterWithContext extends BeforeAndAfterEach { 37 | this: Suite => 38 | 39 | val defaultFilePath = "src/test/resources/" 40 | import eleflow.sparknotebook.TestSparkConf._ 41 | ClusterSettings.master=Some("local[*]") 42 | conf.set("spark.driver.allowMultipleContexts","true") 43 | val context = new SparkNotebookContext(conf) 44 | 45 | override def beforeEach() = { 46 | setLogLevels(Level.INFO, Seq("spark", "org.eclipse.jetty", "akka")) 47 | } 48 | 49 | def setLogLevels(level: org.apache.log4j.Level, loggers: TraversableOnce[String]) = { 50 | loggers.map { 51 | loggerName => 52 | val logger = Logger.getLogger(loggerName) 53 | val prevLevel = logger.getLevel() 54 | logger.setLevel(level) 55 | loggerName -> prevLevel 56 | }.toMap 57 | } 58 | 59 | override def afterEach() = { 60 | context.clearContext 61 | System.clearProperty("spark.master.port") 62 | } 63 | } 64 | 65 | -------------------------------------------------------------------------------- /ansible/sparknotebook-prov.yml: -------------------------------------------------------------------------------- 1 | - hosts: localhost 2 | connection: local 3 | gather_facts: False 4 | vars: 5 | keypair: "objeleflow" 6 | instance_type: "r3.xlarge" 7 | price: "0.15" 8 | image: "ami-d13845e1" 9 | group: "SparkNotebookApplication" 10 | region: "us-west-2" 11 | zone: "us-west-2b" 12 | iamrole: "dev-ops" 13 | tasks: 14 | - name: create sparknotebook security group 15 | ec2_group: 16 | name: "{{ group }}" 17 | description: Security Group for the Web app 18 | region: "{{ region }}" 19 | purge_rules: false 20 | purge_rules_egress: false 21 | rules: 22 | - proto: tcp 23 | from_port: 80 24 | to_port: 80 25 | cidr_ip: 0.0.0.0/0 26 | - proto: tcp 27 | from_port: 22 28 | to_port: 22 29 | cidr_ip: 0.0.0.0/0 30 | - proto: tcp 31 | from_port: 4040 32 | to_port: 4040 33 | cidr_ip: 0.0.0.0/0 34 | - proto: tcp 35 | from_port: 8080 36 | to_port: 8080 37 | cidr_ip: 0.0.0.0/0 38 | rules_egress: 39 | - proto: all 40 | cidr_ip: 0.0.0.0/0 41 | 42 | - name: create sparknotebook instance 43 | ec2: image={{ image }} 44 | instance_type={{ instance_type }} 45 | keypair={{ keypair }} 46 | instance_tags='{"Name":"sparknotebook"}' 47 | instance_profile_name={{ iamrole }} 48 | region={{ region }} 49 | zone={{ zone }} 50 | group={{ group }} 51 | 52 | wait=true 53 | #spot_price={{price}} 54 | register: ec2_info 55 | - debug: var=script.stdout 56 | - debug: var=script.stderr 57 | # # vpc_subnet_id=subnet-e32aff86 58 | # assign_public_ip=yes 59 | # - add_host: hostname={{ item.public_ip }} groupname=ec2hosts 60 | # with_items: ec2_info.instances 61 | - name: wait for instances to listen on port:22 62 | wait_for: 63 | state=started 64 | host={{ item.public_dns_name }} 65 | port=22 66 | timeout=600 67 | with_items: ec2_info.instances 68 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/visualization/RichDisplay.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.visualization 17 | 18 | import eleflow.sparknotebook.data.{FileDataset, Dataset} 19 | import org.apache.spark.sql.SchemaRDD 20 | import org.refptr.iscala.display.HTMLDisplay 21 | 22 | import scalatags.Text.TypedTag 23 | import scalatags.Text.all._ 24 | import Dataset._ 25 | 26 | 27 | /** 28 | * SparkNotebook 29 | * Copyright (C) 2014 eleflow. 30 | * User: paulomagalhaes 31 | * Date: 10/27/14 10:24 AM 32 | */ 33 | object RichDisplay { 34 | 35 | 36 | 37 | implicit val HTMLTypedTag = HTMLDisplay[TypedTag[String]](_.toString) 38 | implicit val HTMLSchemaRdd = HTMLDisplay[SchemaRDD, TypedTag[String]] { rdd:SchemaRDD => 39 | 40 | div(style:="overflow:scroll", table( 41 | tr( 42 | rdd.schema.fieldNames.map(column=>th(column)) 43 | ), 44 | rdd.take(7).map(row=> 45 | tr(row.map(field=> td(String.valueOf(field))) 46 | )) 47 | )) 48 | } 49 | 50 | 51 | implicit val HTMLSeqAny = HTMLDisplay[Seq[Any], TypedTag[String]] { seq => 52 | div(style:="overflow:scroll", table( 53 | seq.zipWithIndex.map(row=> 54 | tr(th(row._2), td(String.valueOf(row._1)))) 55 | )) 56 | } 57 | 58 | implicit val HTMLSeqTuples = HTMLDisplay[Seq[Product], TypedTag[String]] { seq => 59 | div(style:="overflow:scroll", table( 60 | seq.zipWithIndex.map(row=> 61 | tr(th(row._2), row._1.productIterator.toList.map(field=> td(String.valueOf(field))) 62 | )) 63 | )) 64 | } 65 | 66 | 67 | implicit val HTMLMapAny = HTMLDisplay[Map[Any, Any], TypedTag[String]] { aMap => 68 | 69 | div(style:="overflow:scroll", table( 70 | aMap.map(entry=> 71 | tr(td(String.valueOf(entry._1)), td(String.valueOf(entry._2)) 72 | )).toSeq 73 | )) 74 | } 75 | 76 | implicit val HTMLArrayTuples = HTMLDisplay[Array[Product], Seq[Product]] { array => 77 | array.toSeq 78 | } 79 | 80 | implicit val HTMLDataset = HTMLDisplay[Dataset, SchemaRDD] { dataset => 81 | dataset 82 | } 83 | 84 | implicit val HTMLFileDataset = HTMLDisplay[FileDataset, SchemaRDD] { dataset => 85 | dataset 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /ansible/sparknotebook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | version="0.1.0" 3 | yum -y update 4 | 5 | yum -y groupinstall "Development Tools" 6 | 7 | #install python 2.7 8 | yum -y install python27-devel.x86_64 9 | wget https://bootstrap.pypa.io/get-pip.py 10 | python27 ./get-pip.py 11 | # install zeroMQ 12 | wget http://download.zeromq.org/zeromq-4.0.4.tar.gz 13 | tar xzvf zeromq-4.0.4.tar.gz 14 | cd zeromq-4.0.4 15 | ./config 16 | make install 17 | #install ipython 18 | /usr/local/bin/pip2.7 install "ipython[all]"==2.4 19 | 20 | 21 | createSparknotebookprofile(){ 22 | sudo -u sparknotebook /usr/local/bin/ipython profile create sparknotebook 23 | rm -f ~sparknotebook/.ipython/profile_sparknotebook/ipython_config.py 24 | sudo -u sparknotebook cat << EOF >> 
~sparknotebook/.ipython/profile_sparknotebook/ipython_config.py 25 | # Configuration file for ipython. 26 | 27 | c = get_config() 28 | 29 | c.KernelManager.kernel_cmd = ["/usr/share/sparknotebook/bin/sparknotebook", 30 | #"-mem","28000", 31 | "--profile", "{connection_file}", 32 | "--parent"] 33 | c.NotebookApp.ip = "*" # only add this line if you want IPython-notebook being open to the public 34 | c.NotebookApp.open_browser = False # only add this line if you want to suppress opening a browser after IPython-notebook initialization 35 | EOF 36 | } 37 | 38 | 39 | # Adding system user/group : sparknotebook and sparknotebook 40 | if ! getent group | grep -q "^sparknotebook:" ; 41 | then 42 | echo "Creating system group: sparknotebook" 43 | groupadd sparknotebook 44 | fi 45 | if ! getent passwd | grep -q "^sparknotebook:"; 46 | then 47 | echo "Creating system user: sparknotebook" 48 | useradd --gid sparknotebook --create-home --comment "SparkNotebook Interactive User" sparknotebook 49 | fi 50 | 51 | # install Spark Notebook 52 | file="/tmp/sparknotebook-$version.zip" 53 | if ! test -f "$file" 54 | then 55 | aws s3 cp s3://sparknotebook-public/sparknotebook/sparknotebook-$version.zip /tmp/ 56 | fi 57 | 58 | unzip -o /tmp/sparknotebook-$version.zip -d /usr/share 59 | rm -f /tmp/sparknotebook-$version.zip 60 | rm -f /usr/share/sparknotebook 61 | ln -s /usr/share/sparknotebook-$version /usr/share/sparknotebook 62 | 63 | chown -R sparknotebook:sparknotebook /usr/share/sparknotebook-$version 64 | chown sparknotebook:sparknotebook /usr/share/sparknotebook 65 | 66 | #install ipython init.d scripts 67 | mkdir -p /files 68 | chown sparknotebook:sparknotebook /files 69 | mkdir -p /var/log/sparknotebook 70 | chown sparknotebook:sparknotebook /var/log/sparknotebook 71 | mkdir -p /etc/default/sparknotebook 72 | chown sparknotebook:sparknotebook /etc/default/sparknotebook 73 | mkdir -p /var/run/sparknotebook 74 | chown sparknotebook:sparknotebook /var/run/sparknotebook 75 | sudo -u sparknotebook ipython profile create sparknotebook 76 | 77 | createSparknotebookprofile 78 | chmod +x /etc/init.d/sparknotebook 79 | -------------------------------------------------------------------------------- /ansible/inventory/ec2.ini: -------------------------------------------------------------------------------- 1 | # Ansible EC2 external inventory script settings 2 | # 3 | 4 | [ec2] 5 | 6 | # to talk to a private eucalyptus instance uncomment these lines 7 | # and edit edit eucalyptus_host to be the host name of your cloud controller 8 | #eucalyptus = True 9 | #eucalyptus_host = clc.cloud.domain.org 10 | 11 | # AWS regions to make calls to. Set this to 'all' to make request to all regions 12 | # in AWS and merge the results together. Alternatively, set this to a comma 13 | # separated list of regions. E.g. 'us-east-1,us-west-1,us-west-2' 14 | regions = us-west-2,sa-east-1 15 | regions_exclude = us-gov-west-1,cn-north-1 16 | 17 | # When generating inventory, Ansible needs to know how to address a server. 18 | # Each EC2 instance has a lot of variables associated with it. Here is the list: 19 | # http://docs.pythonboto.org/en/latest/ref/ec2.html#module-boto.ec2.instance 20 | # Below are 2 variables that are used as the address of a server: 21 | # - destination_variable 22 | # - vpc_destination_variable 23 | 24 | # This is the normal destination variable to use. If you are running Ansible 25 | # from outside EC2, then 'public_dns_name' makes the most sense. 
If you are 26 | # running Ansible from within EC2, then perhaps you want to use the internal 27 | # address, and should set this to 'private_dns_name'. 28 | destination_variable = public_dns_name 29 | 30 | # For server inside a VPC, using DNS names may not make sense. When an instance 31 | # has 'subnet_id' set, this variable is used. If the subnet is public, setting 32 | # this to 'ip_address' will return the public IP address. For instances in a 33 | # private subnet, this should be set to 'private_ip_address', and Ansible must 34 | # be run from with EC2. 35 | vpc_destination_variable = ip_address 36 | 37 | # To tag instances on EC2 with the resource records that point to them from 38 | # Route53, uncomment and set 'route53' to True. 39 | route53 = False 40 | 41 | # Additionally, you can specify the list of zones to exclude looking up in 42 | # 'route53_excluded_zones' as a comma-separated list. 43 | # route53_excluded_zones = samplezone1.com, samplezone2.com 44 | 45 | # By default, only EC2 instances in the 'running' state are returned. Set 46 | # 'all_instances' to True to return all instances regardless of state. 47 | all_instances = False 48 | 49 | # By default, only RDS instances in the 'available' state are returned. Set 50 | # 'all_rds_instances' to True return all RDS instances regardless of state. 51 | all_rds_instances = False 52 | 53 | # API calls to EC2 are slow. For this reason, we cache the results of an API 54 | # call. Set this to the path you want cache files to be written to. Two files 55 | # will be written to this directory: 56 | # - ansible-ec2.cache 57 | # - ansible-ec2.index 58 | cache_path = ~/.ansible/tmp 59 | 60 | # The number of seconds a cache file is considered valid. After this many 61 | # seconds, a new API call will be made, and the cache file will be updated. 62 | # To disable the cache, set this value to 0 63 | cache_max_age = 300 64 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/SparkNotebookInterpreter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package eleflow.sparknotebook 17 | 18 | import org.apache.spark.SparkConf 19 | import org.apache.spark.repl.SparkILoop 20 | 21 | import scala.collection.immutable 22 | import scala.tools.nsc.interpreter.{IR, NamedParam} 23 | import org.apache.spark.repl.SparkILoop 24 | import org.apache.spark.{SparkConf} 25 | import org.refptr.iscala.{Results, Interpreter} 26 | 27 | 28 | class SparkNotebookInterpreter(classpath: String, args: Seq[String], usejavacp: Boolean=true) extends Interpreter(classpath, args, false, usejavacp) { 29 | 30 | var snc: SparkNotebookContext = _ 31 | 32 | override def initializeSpark() { 33 | snc = createContext() 34 | 35 | val namedParam = NamedParam[SparkNotebookContext]("snc", snc) 36 | intp.beQuietDuring(bind(namedParam.name, namedParam.tpe, namedParam.value, immutable.List("@transient"))) match { 37 | case IR.Success => Unit 38 | case _ => throw new RuntimeException("Spark failed to initialize") 39 | } 40 | 41 | val importSVresult = interpret( """ 42 | import org.apache.spark.SparkContext._ 43 | import eleflow.sparknotebook._ 44 | import eleflow.sparknotebook.visualization.RichDisplay._ 45 | import snc._ 46 | import eleflow.sparknotebook.data.Dataset._ 47 | """) 48 | importSVresult match { 49 | case Results.Value(value, tpe, repr) => Unit 50 | case Results.NoValue => Unit 51 | case Results.Exception(_,_,_,ee) => throw new RuntimeException("SparkContext failed to be imported", ee) 52 | case _ => throw new RuntimeException("SparkContext failed to be imported") 53 | } 54 | 55 | } 56 | 57 | override def sparkCleanUp() { 58 | if (snc!=null) { 59 | snc.clearContext 60 | } 61 | } 62 | 63 | override lazy val appName: String = "SparkNotebook" 64 | 65 | def createContext(): SparkNotebookContext = { 66 | val execUri = System.getenv("SPARK_EXECUTOR_URI") 67 | val jars = SparkILoop.getAddedJars 68 | val conf = new SparkConf() 69 | .setMaster(getMaster()) 70 | .setAppName(this.appName) 71 | .setJars(jars) 72 | .set("spark.repl.class.uri", intp.classServer.uri) //very important! 
spark treat REPL very differently 73 | .set("spark.files.overwrite","true") 74 | if (execUri != null) { 75 | conf.set("spark.executor.uri", execUri) 76 | } 77 | if (System.getenv("SPARK_HOME") != null) { 78 | conf.setSparkHome(System.getenv("SPARK_HOME")) 79 | } 80 | new SparkNotebookContext(conf) 81 | } 82 | 83 | protected def getMaster(): String = { 84 | val master = { 85 | val envMaster = sys.env.get("MASTER") 86 | val propMaster = sys.props.get("spark.master") 87 | propMaster.orElse(envMaster).getOrElse("local[*]") 88 | } 89 | master 90 | } 91 | 92 | } -------------------------------------------------------------------------------- /ansible/sparknotebook: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # sparknotebook \ 4 | # 5 | # chkconfig: 2345 20 80 6 | # description: Spark Notebook Interactive 7 | # 8 | 9 | ### BEGIN INIT INFO 10 | # Provides: sparknotebook 11 | # Required-Start: $remote_fs $syslog 12 | # Required-Stop: $remote_fs $syslog 13 | # Default-Start: 2 3 4 5 14 | # Default-Stop: 0 1 6 15 | # Should-Start: 16 | # Should-Stop: 17 | # Short-Description: Spark Notebook Interactive 18 | # Description: Spark Notebook Interactive 19 | ### END INIT INFO 20 | 21 | ### ----------------- 22 | # This script was created using following sources 23 | # 24 | # http://stackoverflow.com/questions/8124345/call-to-daemon-in-a-etc-init-d-script-is-blocking-not-running-in-background 25 | # https://fedoraproject.org/wiki/Packaging:SysVInitScript#Initscript_template 26 | ### ----------------- 27 | 28 | # Source function library. 29 | . /etc/rc.d/init.d/functions 30 | 31 | prog="sparknotebook" 32 | 33 | # FIXME The pid file should be handled by the executed script 34 | # The pid can be filled in in this script 35 | PIDFILE=/var/run/sparknotebook/running.pid 36 | 37 | if [ -z "$DAEMON_USER" ]; then 38 | DAEMON_USER=sparknotebook 39 | fi 40 | 41 | 42 | # smb could define some additional options in $RUN_OPTS 43 | RUN_CMD="ipython notebook --profile sparknotebook" 44 | 45 | [ -e /etc/sysconfig/$prog ] && . /etc/sysconfig/$prog 46 | 47 | lockfile=/var/lock/subsys/$prog 48 | 49 | start() { 50 | echo -n $"Starting $prog: " 51 | 52 | nohup runuser -l $DAEMON_USER -c "${RUN_CMD}" >> /var/log/sparknotebook/daemon.log 2>&1 & 53 | 54 | # The way to go, but doesn't work properly 55 | # If the app creates the pid file this gets messy 56 | # daemon --user $DAEMON_USER --pidfile $PIDFILE $RUN_CMD & 57 | 58 | 59 | retval=$? # last error code 60 | PID=$! # pid of last backgrounded process 61 | [ $retval -eq 0 ] && touch ${lockfile} && success || failure 62 | 63 | # Insert pid into pid file for CentOS killproc function 64 | [ -d "/var/run/sparknotebook" ] || install -d -o "$DAEMON_USER" -m750 "/var/run/sparknotebook" 65 | echo 66 | echo $PID > ${PIDFILE} 67 | return $retval 68 | } 69 | 70 | stop() { 71 | echo -n $"Stopping $prog: " 72 | killproc -p $PIDFILE $prog 73 | retval=$? 
74 | [ $retval -eq 0 ] && rm -f $lockfile 75 | return $retval 76 | } 77 | 78 | restart() { 79 | stop 80 | start 81 | } 82 | 83 | reload() { 84 | restart 85 | } 86 | 87 | force_reload() { 88 | restart 89 | } 90 | 91 | rh_status() { 92 | # run checks to determine if the service is running or use generic status 93 | status -p $PIDFILE -l $lockfile $prog 94 | } 95 | 96 | rh_status_q() { 97 | rh_status >/dev/null 2>&1 98 | } 99 | 100 | 101 | case "$1" in 102 | start) 103 | rh_status_q && exit 0 104 | $1 105 | ;; 106 | stop) 107 | rh_status_q || exit 0 108 | $1 109 | ;; 110 | restart) 111 | $1 112 | ;; 113 | reload) 114 | rh_status || exit 7 115 | $1 116 | ;; 117 | force-reload) 118 | force_reload 119 | ;; 120 | status) 121 | rh_status 122 | ;; 123 | condrestart|try-restart) 124 | rh_status || exit 0 125 | restart 126 | ;; 127 | *) 128 | echo $"Usage: $0 {start|stop|status|restart|condrestart|try-restart|reload|force-reload}" 129 | exit 2 130 | esac 131 | exit $? 132 | -------------------------------------------------------------------------------- /src/test/scala/eleflow/sparknotebook/FuncTestSparkNotebookContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook 17 | 18 | import eleflow.sparknotebook.data.{DataTransformer, Dataset} 19 | import eleflow.sparknotebook.enums.DataSetType 20 | import org.apache.spark.SparkException 21 | import org.scalatest._ 22 | import org.scalatest.mock.MockitoSugar 23 | 24 | /** 25 | * Created by dirceu on 14/10/14. 
26 | */ 27 | class FuncTestSparkNotebookContext extends FlatSpec with Matchers with MockitoSugar with BeforeAndAfterWithContext { 28 | this: Suite => 29 | 30 | val uberContext = context 31 | 32 | "Functional SparkNotebookContext" should 33 | "correctly load rdd" in { 34 | 35 | import eleflow.sparknotebook.data.Dataset._ 36 | 37 | val dataset = Dataset(uberContext, s"${defaultFilePath}FuncTestSparkNotebookContextFile1.csv") 38 | 39 | val testDataSet = Dataset(uberContext, s"${defaultFilePath}FuncTestSparkNotebookContextFile2.csv") 40 | 41 | val unionDataset= DataTransformer.extractStringsFromTrainTestSchema(dataset.toSchemaRDD, testDataSet.toSchemaRDD, 42 | Seq(0)) 43 | val normalized = unionDataset.summarizedColumns.map { 44 | case (colIndex, (size, funcIndex, funcValue)) => (colIndex + 1, (size, funcIndex, funcValue)) 45 | } 46 | val result = DataTransformer.createLabeledPointFromRDD(dataset, Seq(0), normalized,DataSetType.Train,unionDataset.columnsSize) 47 | val all = result.take(3) 48 | val (_, first) = all.head 49 | val (_, second) = all.tail.head 50 | assert(first.label == 1) 51 | assert(first.features.toArray.deep == Array[Double](5.0, 0.0, 1.0, 10.5).deep) 52 | assert(second.label == 2) 53 | assert(second.features.toArray.deep == Array[Double](1.0, 1.0, 0.0, 0.1).deep) 54 | uberContext.clearContext 55 | } 56 | 57 | it should "Throw an exception when process an empty numeric column" in { 58 | 59 | @transient lazy val context = uberContext 60 | 61 | val sc = context.sparkContext 62 | try { 63 | import eleflow.sparknotebook.data.Dataset._ 64 | val dataset = Dataset(context, s"${defaultFilePath}FuncTestSparkNotebookContextFile1.csv") 65 | dataset.take(3) 66 | } catch { 67 | case e: SparkException => { 68 | assert(e.getMessage.contains("UnexpectedFileFormatException")) 69 | } 70 | } 71 | } 72 | 73 | it should "Correct handle empty string values" in { 74 | @transient lazy val context = uberContext 75 | val sc = context.sparkContext 76 | val schemaRdd = Dataset(context, s"${defaultFilePath}FuncTestSparkNotebookContextEmpty.csv").schemaRDD 77 | val result = DataTransformer.createLabeledPointFromRDD(schemaRdd, Seq(0),DataSetType.Train) 78 | 79 | } 80 | 81 | it should "Throw an exception when input have different number of columns" in { 82 | val sc = uberContext.sparkContext() 83 | try { 84 | 85 | val result = context.load(s"${defaultFilePath}FuncTestSparkNotebookContextFile1.csv", TestSparkConf.separator) 86 | } catch { 87 | case e: SparkException => 88 | assert(e.getMessage.contains("UnexpectedFileFormatException")) 89 | } 90 | } 91 | 92 | } 93 | -------------------------------------------------------------------------------- /ansible/nginx.conf: -------------------------------------------------------------------------------- 1 | # For more information on configuration, see: 2 | # * Official English Documentation: http://nginx.org/en/docs/ 3 | # * Official Russian Documentation: http://nginx.org/ru/docs/ 4 | 5 | user nginx; 6 | worker_processes 1; 7 | 8 | error_log /var/log/nginx/error.log; 9 | #error_log /var/log/nginx/error.log notice; 10 | #error_log /var/log/nginx/error.log info; 11 | 12 | pid /var/run/nginx.pid; 13 | 14 | 15 | events { 16 | worker_connections 1024; 17 | } 18 | 19 | 20 | http { 21 | include /etc/nginx/mime.types; 22 | default_type application/octet-stream; 23 | 24 | log_format main '$remote_addr - $remote_user [$time_local] "$request" ' 25 | '$status $body_bytes_sent "$http_referer" ' 26 | '"$http_user_agent" "$http_x_forwarded_for"'; 27 | 28 | access_log 
/var/log/nginx/access.log main; 29 | 30 | sendfile on; 31 | #tcp_nopush on; 32 | 33 | #keepalive_timeout 0; 34 | keepalive_timeout 65; 35 | 36 | #gzip on; 37 | 38 | # Load modular configuration files from the /etc/nginx/conf.d directory. 39 | # See http://nginx.org/en/docs/ngx_core_module.html#include 40 | # for more information. 41 | include /etc/nginx/conf.d/*.conf; 42 | 43 | index index.html index.htm; 44 | 45 | server { 46 | listen 80; 47 | server_name localhost; 48 | 49 | #charset koi8-r; 50 | 51 | #access_log /var/log/nginx/host.access.log main; 52 | 53 | location / { 54 | proxy_pass http://localhost:8888; 55 | proxy_http_version 1.1; 56 | proxy_set_header Upgrade $http_upgrade; 57 | proxy_set_header Connection "upgrade"; 58 | proxy_set_header Origin ""; 59 | } 60 | 61 | location /files/ { 62 | root /; 63 | autoindex on; 64 | } 65 | 66 | # redirect server error pages to the static page /40x.html 67 | # 68 | error_page 404 /404.html; 69 | location = /40x.html { 70 | } 71 | 72 | # redirect server error pages to the static page /50x.html 73 | # 74 | error_page 500 502 503 504 /50x.html; 75 | location = /50x.html { 76 | } 77 | 78 | # proxy the PHP scripts to Apache listening on 127.0.0.1:80 79 | # 80 | #location ~ \.php$ { 81 | # proxy_pass http://127.0.0.1; 82 | #} 83 | 84 | # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000 85 | # 86 | #location ~ \.php$ { 87 | # root html; 88 | # fastcgi_pass 127.0.0.1:9000; 89 | # fastcgi_index index.php; 90 | # fastcgi_param SCRIPT_FILENAME /scripts$fastcgi_script_name; 91 | # include fastcgi_params; 92 | #} 93 | 94 | # deny access to .htaccess files, if Apache's document root 95 | # concurs with nginx's one 96 | # 97 | #location ~ /\.ht { 98 | # deny all; 99 | #} 100 | } 101 | 102 | 103 | # another virtual host using mix of IP-, name-, and port-based configuration 104 | # 105 | #server { 106 | # listen 8000; 107 | # listen somename:8080; 108 | # server_name somename alias another.alias; 109 | # root html; 110 | 111 | # location / { 112 | # } 113 | #} 114 | 115 | 116 | # HTTPS server 117 | # 118 | #server { 119 | # listen 443; 120 | # server_name localhost; 121 | # root html; 122 | 123 | # ssl on; 124 | # ssl_certificate cert.pem; 125 | # ssl_certificate_key cert.key; 126 | 127 | # ssl_session_timeout 5m; 128 | 129 | # ssl_protocols SSLv2 SSLv3 TLSv1; 130 | # ssl_ciphers HIGH:!aNULL:!MD5; 131 | # ssl_prefer_server_ciphers on; 132 | 133 | # location / { 134 | # } 135 | #} 136 | 137 | } 138 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark Notebook 2 | 3 | [![Build Status][build-badge]][build-url] 4 | [![License][license-badge]][license-url] 5 | 6 | The Spark Notebook project is fast way of getting a [Spark](http://spark.apache.org/) cluster up and running on [AWS](http://aws.amazon.com) with the friendly [IPython](http://ipython.org) interface. 7 | 8 | ## Before you start 9 | You'll need 10 | 11 | 1. to have [Docker installed](https://docs.docker.com/installation/) (recommended) or [no docker setup](nodocker.md) 12 | 1. [AWS access keys](http://aws.amazon.com/developers/access-keys) 13 | 1. One [AWS keypair](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html#having-ec2-create-your-key-pair) 14 | 15 | 16 | ## Setup 17 | 1. git clone https://github.com/eleflow/sparknotebook.git 18 | 1. cd sparknotebook 19 | 1. 
create an aws.deploy.env file with these entries: 20 | 21 | ```sh 22 | AWS_ACCESS_KEY_ID= 23 | AWS_SECRET_ACCESS_KEY= 24 | AWS_KEY_PAIR= 25 | ``` 26 | 1. Run 27 | 28 | ``` $ docker build --rm -f=aws.deploy.Dockerfile -t=aws.deploy .``` 29 | 30 | ## Running the Notebook on AWS 31 | 32 | 1. Run `sudo docker run -it --env-file ./aws.deploy.env --volume $PWD:/sparknotebook --volume $HOME/.ssh:/.ssh aws.deploy` and, if all goes well, you will see the IP address of your sparknotebook server in a line like this: 33 | ```sh 34 | ... 35 | 36 | PLAY RECAP ******************************************************************** 37 | 52.10.183.42 : ok=21 changed=3 unreachable=0 failed=0 38 | ``` 39 | 1. 52.10.183.42 will be replaced by your own IP address. Open that IP address in your browser to access the notebook. 40 | 41 | ## Spark Notebook 42 | 43 | The Spark Notebook kernel is deployed on your server, and you can access it through port 80 with a web browser. 44 | The initial notebook state is shown in the picture below: 45 | 46 | ![Alt text](/../images/images/EmptyNotebook.png?raw=true "Initial state of a Spark Notebook") 47 | 48 | To start a new notebook, click the New Notebook button and you will be redirected to a new tab containing an empty notebook. 49 | The notebook is a container of multiple text areas, into which you can type any Scala code, including multi-line scripts. To execute code, focus the text area and hit Shift + ENTER, or click the play button in the notebook header. Each time you submit code to the notebook it is compiled and, if compilation succeeds, executed. 50 | 51 | ## Cluster Settings 52 | 53 | One of the cluster settings you are likely to change is the number of slaves. To change it to 30, you can run this code on the Spark Notebook: 54 | ```scala 55 | ClusterSettings.coreInstanceCount = 30 // Number of workers available in your cluster - defaults to 3 56 | ``` 57 | For other settings, see [ClusterSettings](src/main/scala/eleflow/sparknotebook/SparkNotebookContext.scala) 58 | ## SparkContext 59 | A SparkContext can be accessed with: 60 | ```scala 61 | sparkContext 62 | ``` 63 | This is a method of SparkNotebookContext; it provisions the machines and sets up the cluster the first time it runs. Example output of this method is shown below: 64 | 65 | ![Alt text](/../images/images/ClusterInstantiation.png?raw=true "Sample output of a cluster instantiation") 66 | 67 | ## Shutdown 68 | 69 | To shut down the cluster and terminate the master and slaves, run: 70 | ```scala 71 | terminate 72 | ``` 73 | 74 | ## Monitoring 75 | ### Ganglia 76 | 77 | The master instance of your cluster also has a monitoring tool named Ganglia installed, and its address is displayed when you create the SparkContext. 78 | Ganglia helps you monitor CPU, memory and disk usage, as well as JVM data such as GC executions, displaying graphs of these metrics. It is very useful for choosing the correct cluster size for your tasks. 79 | The Ganglia address is printed on the screen during cluster instantiation. It is always available at masterhost:5080/ganglia. 80 | Note that the information shown in Ganglia has a slight delay. 
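Putting the pieces above together, a typical notebook session might look like the sketch below. It uses only the `ClusterSettings` fields and the `sparkContext`/`terminate` calls described above (`ec2KeyName` comes from the ClusterSettings source); the instance count and the key-pair name are illustrative placeholders.

```scala
// Adjust the cluster before the first call to sparkContext, which provisions it
ClusterSettings.coreInstanceCount = 10           // illustrative value; the default is 3
ClusterSettings.ec2KeyName = Some("my-keypair")  // hypothetical AWS key pair name

// The first call provisions the EC2 cluster (and prints the Ganglia address)
val sc = sparkContext
sc.parallelize(1 to 1000).sum()

// Shut the master and slaves down when you are done
terminate
```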
81 | 82 | # Local build 83 | 84 | To build and run locally go [here](buildlocal.md) 85 | 86 | # License 87 | 88 | This project is distributed under Apache License Version 2.0 89 | 90 | [build-badge]: https://travis-ci.org/eleflow/sparknotebook.svg?branch=master 91 | [build-url]: https://travis-ci.org/eleflow/sparknotebook 92 | [license-badge]: https://img.shields.io/badge/License-Apache%202-blue.svg?style=flat 93 | [license-url]: LICENSE 94 | -------------------------------------------------------------------------------- /sbt/sbt/bin/sbt: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | ### ------------------------------- ### 5 | ### Helper methods for BASH scripts ### 6 | ### ------------------------------- ### 7 | 8 | realpath () { 9 | ( 10 | TARGET_FILE="$1" 11 | FIX_CYGPATH="$2" 12 | 13 | cd "$(dirname "$TARGET_FILE")" 14 | TARGET_FILE=$(basename "$TARGET_FILE") 15 | 16 | COUNT=0 17 | while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] 18 | do 19 | TARGET_FILE=$(readlink "$TARGET_FILE") 20 | cd "$(dirname "$TARGET_FILE")" 21 | TARGET_FILE=$(basename "$TARGET_FILE") 22 | COUNT=$(($COUNT + 1)) 23 | done 24 | 25 | # make sure we grab the actual windows path, instead of cygwin's path. 26 | if [[ "x$FIX_CYGPATH" != "x" ]]; then 27 | echo "$(cygwinpath "$(pwd -P)/$TARGET_FILE")" 28 | else 29 | echo "$(pwd -P)/$TARGET_FILE" 30 | fi 31 | ) 32 | } 33 | 34 | 35 | # Uses uname to detect if we're in the odd cygwin environment. 36 | is_cygwin() { 37 | local os=$(uname -s) 38 | case "$os" in 39 | CYGWIN*) return 0 ;; 40 | *) return 1 ;; 41 | esac 42 | } 43 | 44 | # TODO - Use nicer bash-isms here. 45 | CYGWIN_FLAG=$(if is_cygwin; then echo true; else echo false; fi) 46 | 47 | 48 | # This can fix cygwin style /cygdrive paths so we get the 49 | # windows style paths. 50 | cygwinpath() { 51 | local file="$1" 52 | if [[ "$CYGWIN_FLAG" == "true" ]]; then 53 | echo $(cygpath -w $file) 54 | else 55 | echo $file 56 | fi 57 | } 58 | 59 | . "$(dirname "$(realpath "$0")")/sbt-launch-lib.bash" 60 | 61 | 62 | declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" 63 | declare -r sbt_opts_file=".sbtopts" 64 | declare -r etc_sbt_opts_file="${sbt_home}/conf/sbtopts" 65 | declare -r win_sbt_opts_file="${sbt_home}/conf/sbtconfig.txt" 66 | 67 | usage() { 68 | cat < path to global settings/plugins directory (default: ~/.sbt) 77 | -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series) 78 | -ivy path to local Ivy repository (default: ~/.ivy2) 79 | -mem set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) 80 | -no-share use all local caches; no sharing 81 | -no-global uses global caches, but does not use global ~/.sbt directory. 82 | -jvm-debug Turn on JVM debugging, open at the given port. 
83 | -batch Disable interactive mode 84 | 85 | # sbt version (default: from project/build.properties if present, else latest release) 86 | -sbt-version use the specified version of sbt 87 | -sbt-jar use the specified jar as the sbt launcher 88 | -sbt-rc use an RC version of sbt 89 | -sbt-snapshot use a snapshot version of sbt 90 | 91 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) 92 | -java-home alternate JAVA_HOME 93 | 94 | # jvm options and output control 95 | JAVA_OPTS environment variable, if unset uses "$java_opts" 96 | SBT_OPTS environment variable, if unset uses "$default_sbt_opts" 97 | .sbtopts if this file exists in the current directory, it is 98 | prepended to the runner args 99 | /etc/sbt/sbtopts if this file exists, it is prepended to the runner args 100 | -Dkey=val pass -Dkey=val directly to the java runtime 101 | -J-X pass option -X directly to the java runtime 102 | (-J is stripped) 103 | -S-X add -X to sbt's scalacOptions (-S is stripped) 104 | 105 | In the case of duplicated or conflicting options, the order above 106 | shows precedence: JAVA_OPTS lowest, command line options highest. 107 | EOM 108 | } 109 | 110 | 111 | 112 | process_my_args () { 113 | while [[ $# -gt 0 ]]; do 114 | case "$1" in 115 | -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; 116 | -no-share) addJava "$noshare_opts" && shift ;; 117 | -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;; 118 | -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; 119 | -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;; 120 | -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; 121 | -batch) exec (t._1 + 1, t._2 )) 41 | val columnsSize = summarizedColumns.map(_._2._1).sum().toInt 42 | val targetIndex=train.columnIndexOf(target) 43 | val idIndex = test.columnIndexOf(id) 44 | (createLabeledPointFromRDD(train.sliceByName(excludes = Seq(id)), Seq(targetIndex-1), summarizedColumns, DataSetType.Train, columnsSize), 45 | createLabeledPointFromRDD(test, Seq(idIndex), summarizedColumns, DataSetType.Test, columnsSize)) 46 | } 47 | 48 | def createLabeledPointFromRDD(schemaRDD: Dataset, target: Seq[Int], datasetType: DataSetType.Types): RDD[(Map[Double,Any], LabeledPoint)] = { 49 | createLabeledPointFromRDD(schemaRDD, target, schemaRDD.summarizedColumns, datasetType, schemaRDD.columnsSize.toInt - 1) 50 | } 51 | 52 | def createLabeledPointFromRDD(rdd: Dataset, target: Seq[Int], 53 | normalized: RDD[(Int, (Int, (Any => Int), (Any => Double)))], 54 | dataSetType: DataSetType.Types, columnsSize: Int): RDD[(Map[Double,Any], LabeledPoint)] = { 55 | val (fields, idField) = rdd.schema.fields.zipWithIndex.partition(f => !target.contains(f._2)) 56 | val normalizedStrings = rdd.context.broadcast(normalized.collectAsMap()) 57 | rdd.zipWithIndex.map { 58 | case (row, rowIndex) => 59 | val norm = normalizedStrings.value 60 | val normValues = fields.map { 61 | case (fieldType, index) => 62 | norm.get(index).map { 63 | f => 64 | (f._1, f._2.apply(row(index)), f._3.apply(row(index))) 65 | }.getOrElse( 66 | throw new UnexpectedValueException(s"Unexpected String Value exception ${row(index)}")) 67 | } 68 | 69 | val (_, indexes, values) = normValues.tail.scanLeft((normValues.head))((b, a) => (b._1 + a._1, (b._1 + a._2), a._3)).filter(_._3 != 0).unzip3 70 | val rowIndexD = rowIndex.toDouble +1 71 | (idField.head._1.dataType) match { 72 | case (StringType) => { 73 | dataSetType match { 74 | case 
DataSetType.Test => (Map(rowIndexD -> row(target.head)), LabeledPoint(rowIndexD,Vectors.sparse(columnsSize, indexes.toArray, values.toArray))) 75 | case DataSetType.Train => (Map(rowIndexD -> row(target.head)), LabeledPoint(rowIndexD,Vectors.sparse(columnsSize, indexes.toArray, values.toArray))) 76 | } 77 | } 78 | case _ => { 79 | dataSetType match { 80 | case DataSetType.Train => (Map(rowIndexD -> row(target.head) ), LabeledPoint(toDouble(row(target.head)), Vectors.sparse(columnsSize, indexes.toArray, values.toArray))) 81 | case DataSetType.Test => (Map(rowIndexD -> row(target.head) ), LabeledPoint(rowIndexD, Vectors.sparse(columnsSize, indexes.toArray, values.toArray))) 82 | } 83 | } 84 | } 85 | } 86 | } 87 | 88 | def extractStringsFromTrainTestSchema(trainDataSet: Dataset, testDataSet: Dataset, target: Seq[Int]): Dataset = { 89 | val rdd = trainDataSet.slice(excludes = target) 90 | rdd.unionAll(testDataSet) 91 | } 92 | 93 | def toDouble(toConvert: Any): Double = { 94 | toConvert match { 95 | case v: Int => v.toDouble 96 | case v: Long => v.toDouble 97 | case v: BigDecimal => v.toDouble 98 | case v: Double => v 99 | case v: Timestamp => (v.getTime / 3600000).toDouble 100 | case v: String => v.toDouble 101 | case v: Byte => v.toDouble 102 | case v: Boolean => v match { 103 | case true => 1d 104 | case false => 0d 105 | } 106 | case _ => throw new Exception(toConvert.toString) 107 | } 108 | } 109 | 110 | def mapStringIdsToInt(rdd: SchemaRDD, columns: Seq[String]): Seq[Int] = rdd.schema.fields.zipWithIndex. 111 | filter(f => columns.contains(f._1.name)).map(_._2) 112 | 113 | 114 | def mapIdsToInt(rdd: SchemaRDD, columns: Seq[Union[IS]]): Seq[Int] = { 115 | columns.headOption.map { 116 | _.value[Int].map { 117 | _ => columns.map(_.value[Int].get) 118 | }.getOrElse { 119 | mapStringIdsToInt(rdd, columns.map(_.value[String].get)) 120 | } 121 | }.getOrElse(Seq.empty[Int]) 122 | } 123 | } 124 | 125 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/util/DateTimeParser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package eleflow.sparknotebook.util 17 | 18 | /** 19 | * Created by dirceu on 24/02/15. 
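 * Parses a date string either with a format previously registered via applyDateFormat (read back from the cluster's date-format file) or, when none is available, with a format inferred from DATE_FORMAT_REGEXPS.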
20 | */ 21 | import java.nio.charset.StandardCharsets 22 | import eleflow.sparknotebook.enums.PeriodOfDay 23 | import org.apache.spark.SparkFiles 24 | 25 | import scala.collection.JavaConversions._ 26 | import java.nio.file.{FileSystems, Files} 27 | import java.text.ParseException 28 | 29 | import org.joda.time.{ DateTime} 30 | import org.joda.time.format.DateTimeFormat 31 | 32 | import scala.util.{Success, Try} 33 | 34 | object DateTimeParser extends Serializable { 35 | 36 | def parse(dateString: String): Option[DateTime] = { 37 | val dateFormat: Option[String] = readDateFormat.orElse(determineDateFormat(dateString)) 38 | dateFormat.flatMap { f => 39 | Try { 40 | parse(dateString, dateFormat) 41 | } match { 42 | case Success(s) => s 43 | case _ => None 44 | } 45 | } 46 | } 47 | 48 | def parse(dateString: String, dateFormat: String): Option[DateTime] = { 49 | val formatter = DateTimeFormat.forPattern(dateFormat).withZoneUTC() 50 | return Some(formatter.parseDateTime(dateString)) 51 | } 52 | 53 | def parse(dateString: String, dateFormatOption: Option[String]): Option[DateTime] = { 54 | dateFormatOption match { 55 | case Some(dateFormat) => 56 | parse(dateString, dateFormat) 57 | case None => 58 | parse(dateString) 59 | } 60 | 61 | } 62 | 63 | def isValidDate(dateString: String): Boolean = parse(dateString).isDefined 64 | 65 | def isValidDate(dateString: String, dateFormat: String): Boolean = { 66 | try { 67 | parse(dateString, dateFormat) 68 | return true 69 | } 70 | catch { 71 | case e: ParseException => { 72 | return false 73 | } 74 | } 75 | } 76 | 77 | def determineDateFormat(dateString: String): Option[String] = DATE_FORMAT_REGEXPS.keySet.filter( 78 | regexp => dateString.toLowerCase.matches(regexp)).headOption.flatMap(DATE_FORMAT_REGEXPS.get(_)) 79 | 80 | private final val DATE_FORMAT_REGEXPS: Map[String, String] = Map( 81 | "^\\d{8}$" -> "yyyyMMdd", 82 | """^\d{1,2}-\d{1,2}-\d{4}$""" -> "dd-MM-yyyy", 83 | """^\d{4}-\d{1,2}-\d{1,2}$""" -> "yyyy-MM-dd", 84 | """^\d{1,2}/\d{1,2}/\d{4}$""" -> "MM/dd/yyyy", 85 | """^\d{4}/\d{1,2}/\d{1,2}$""" -> "yyyy/MM/dd", 86 | """^\d{1,2}\s[a-z]{3}\s\d{4}$""" -> "dd MMM yyyy", 87 | """^\d{1,2}\s[a-z]{4,}\s\d{4}$""" -> "dd MMMM yyyy", 88 | """^\d{12}$""" -> """yyyyMMddHHmm""", 89 | """^\d{8}\s\d{4}$""" -> """yyyyMMdd HHmm""", 90 | """^\d{1,2}-\d{1,2}-\d{4}\s\d{1,2}:\d{2}$""" -> "dd-MM-yyyy HH:mm", 91 | """^\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{2}$""" -> "yyyy-MM-dd HH:mm", 92 | """^\d{1,2}/\d{1,2}/\\d{4}\s\d{1,2}:\d{2}$""" -> "MM/dd/yyyy HH:mm", 93 | """^\d{4}/\d{1,2}/\\d{1,2}\s\d{1,2}:\d{2}$""" -> "yyyy/MM/dd HH:mm", 94 | """^\d{1,2}\s[a-z]{3}\s\d{4}\s\d{1,2}:\d{2}$""" -> "dd MMM yyyy HH:mm", 95 | """^\d{1,2}\s[a-z]{4,}\s\d{4}\s\d{1,2}:\d{2}$""" -> "dd MMMM yyyy HH:mm", 96 | """^\d{14}$""" -> """yyyyMMddHHmmss""", 97 | """^\d{8}\\s\d{6}$""" -> """yyyyMMdd HHmmss""", 98 | """^\d{1,2}-\d{1,2}-\d{4}\s\d{1,2}:\d{2}:\d{2}$""" -> "dd-MM-yyyy HH:mm:ss", 99 | """^\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{2}:\d{2}$""" -> "yyyy-MM-dd HH:mm:ss", 100 | """^\d{1,2}/\d{1,2}/\d{4}\s\d{1,2}:\d{2}:\d{2}$""" -> "MM/dd/yyyy HH:mm:ss", 101 | """^\d{4}/\d{1,2}/\d{1,2}\s\d{1,2}:\d{2}:\d{2}$""" -> "yyyy/MM/dd HH:mm:ss", 102 | """^\d{1,2}\s[a-z]{3}\s\d{4}\s\d{1,2}:\d{2}:\d{2}$""" -> "dd MMM yyyy HH:mm:ss", 103 | """^\d{1,2}\s[a-z]{4,}\s\d{4}\s\d{1,2}:\d{2}:\d{2}$""" -> "dd MMMM yyyy HH:mm:ss") 104 | 105 | def period(date: DateTime): PeriodOfDay.PeriodOfDay = { 106 | date.getHourOfDay() match { 107 | case hour if (hour < 6) => PeriodOfDay.Dawn 108 | case hour if (hour < 12) => 
PeriodOfDay.Morning 109 | case hour if (hour < 18) => PeriodOfDay.Afternoon 110 | case _ => PeriodOfDay.Evening 111 | } 112 | } 113 | 114 | lazy val dateFormatFilePath = FileSystems.getDefault().getPath(SparkNotebookConfig.tempFolder, SparkNotebookConfig.propertyFolder, 115 | SparkNotebookConfig.dateFormatFileName) 116 | 117 | private lazy val propertyFolderPath = FileSystems.getDefault.getPath(SparkNotebookConfig.tempFolder, SparkNotebookConfig.propertyFolder) 118 | 119 | def applyDateFormat(dateFormat: String) = { 120 | if (Files.notExists(propertyFolderPath)) { 121 | Files.createDirectory(propertyFolderPath) 122 | } 123 | Files.deleteIfExists(dateFormatFilePath) 124 | Files.write(dateFormatFilePath, dateFormat.getBytes) 125 | } 126 | 127 | private def readDateFormat = { 128 | val clusterFilePath = FileSystems.getDefault.getPath(SparkFiles.get(SparkNotebookConfig.dateFormatFileName)) 129 | if (Files.exists(clusterFilePath)) Files.readAllLines(clusterFilePath, StandardCharsets.UTF_8).headOption 130 | else None 131 | } 132 | 133 | } 134 | 135 | /* 136 | * Licensed to the Apache Software Foundation (ASF) under one or more 137 | * contributor license agreements. See the NOTICE file distributed with 138 | * this work for additional information regarding copyright ownership. 139 | * The ASF licenses this file to You under the Apache License, Version 2.0 140 | * (the "License"); you may not use this file except in compliance with 141 | * the License. You may obtain a copy of the License at 142 | * 143 | * http://www.apache.org/licenses/LICENSE-2.0 144 | * 145 | * Unless required by applicable law or agreed to in writing, software 146 | * distributed under the License is distributed on an "AS IS" BASIS, 147 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 148 | * See the License for the specific language governing permissions and 149 | * limitations under the License. 150 | */ 151 | final class DateTimeParser { 152 | 153 | } -------------------------------------------------------------------------------- /sbt/sbt/bin/sbt-launch-lib.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | 4 | # A library to simplify using the SBT launcher from other packages. 5 | # Note: This should be used by tools like giter8/conscript etc. 6 | 7 | # TODO - Should we merge the main SBT script with this library? 8 | 9 | declare -a residual_args 10 | declare -a java_args 11 | declare -a scalac_args 12 | declare -a sbt_commands 13 | declare java_cmd=java 14 | declare java_version 15 | declare -r sbt_bin_dir="$(dirname "$(realpath "$0")")" 16 | declare -r sbt_home="$(dirname "$sbt_bin_dir")" 17 | 18 | echoerr () { 19 | echo 1>&2 "$@" 20 | } 21 | vlog () { 22 | [[ $verbose || $debug ]] && echoerr "$@" 23 | } 24 | dlog () { 25 | [[ $debug ]] && echoerr "$@" 26 | } 27 | 28 | jar_file () { 29 | echo "$(cygwinpath "${sbt_home}/bin/sbt-launch.jar")" 30 | } 31 | 32 | acquire_sbt_jar () { 33 | sbt_jar="$(jar_file)" 34 | 35 | if [[ ! 
-f "$sbt_jar" ]]; then 36 | echoerr "Could not find launcher jar: $sbt_jar" 37 | exit 2 38 | fi 39 | } 40 | 41 | execRunner () { 42 | # print the arguments one to a line, quoting any containing spaces 43 | [[ $verbose || $debug ]] && echo "# Executing command line:" && { 44 | for arg; do 45 | if printf "%s\n" "$arg" | grep -q ' '; then 46 | printf "\"%s\"\n" "$arg" 47 | else 48 | printf "%s\n" "$arg" 49 | fi 50 | done 51 | echo "" 52 | } 53 | 54 | # THis used to be exec, but we loose the ability to re-hook stty then 55 | # for cygwin... Maybe we should flag the feature here... 56 | "$@" 57 | } 58 | 59 | addJava () { 60 | dlog "[addJava] arg = '$1'" 61 | java_args=( "${java_args[@]}" "$1" ) 62 | } 63 | addSbt () { 64 | dlog "[addSbt] arg = '$1'" 65 | sbt_commands=( "${sbt_commands[@]}" "$1" ) 66 | } 67 | addResidual () { 68 | dlog "[residual] arg = '$1'" 69 | residual_args=( "${residual_args[@]}" "$1" ) 70 | } 71 | addDebugger () { 72 | addJava "-agentlib:jdwp:transport=dt_socket,server=y,suspend=n,address=$1" 73 | } 74 | 75 | get_mem_opts () { 76 | # if we detect any of these settings in ${JAVA_OPTS} we need to NOT output our settings. 77 | # The reason is the Xms/Xmx, if they don't line up, cause errors. 78 | if [[ "${JAVA_OPTS}" == *-Xmx* ]] || [[ "${JAVA_OPTS}" == *-Xms* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxPermSize* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxMetaspaceSize* ]] || [[ "${JAVA_OPTS}" == *-XX:ReservedCodeCacheSize* ]]; then 79 | echo "" 80 | else 81 | # a ham-fisted attempt to move some memory settings in concert 82 | # so they need not be messed around with individually. 83 | local mem=${1:-1024} 84 | local codecache=$(( $mem / 8 )) 85 | (( $codecache > 128 )) || codecache=128 86 | (( $codecache < 512 )) || codecache=512 87 | local class_metadata_size=$(( $codecache * 2 )) 88 | local class_metadata_opt=$([[ "$java_version" < "1.8" ]] && echo "MaxPermSize" || echo "MaxMetaspaceSize") 89 | 90 | echo "-Xms${mem}m -Xmx${mem}m -XX:ReservedCodeCacheSize=${codecache}m -XX:${class_metadata_opt}=${class_metadata_size}m" 91 | fi 92 | } 93 | 94 | require_arg () { 95 | local type="$1" 96 | local opt="$2" 97 | local arg="$3" 98 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then 99 | echo "$opt requires <$type> argument" 100 | exit 1 101 | fi 102 | } 103 | 104 | is_function_defined() { 105 | declare -f "$1" > /dev/null 106 | } 107 | 108 | process_args () { 109 | while [[ $# -gt 0 ]]; do 110 | case "$1" in 111 | -h|-help) usage; exit 1 ;; 112 | -v|-verbose) verbose=1 && shift ;; 113 | -d|-debug) debug=1 && shift ;; 114 | 115 | -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;; 116 | -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;; 117 | -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;; 118 | -batch) exec &1 | awk -F '"' '/version/ {print $2}') 137 | vlog "[process_args] java_version = '$java_version'" 138 | } 139 | 140 | # Detect that we have java installed. 141 | checkJava() { 142 | local required_version="$1" 143 | # Now check to see if it's a good enough version 144 | if [[ "$java_version" == "" ]]; then 145 | echo 146 | echo No java installations was detected. 147 | echo Please go to http://www.java.com/getjava/ and download 148 | echo 149 | exit 1 150 | elif [[ ! 
"$java_version" > "$required_version" ]]; then 151 | echo 152 | echo The java installation you have is not up to date 153 | echo $script_name requires at least version $required_version+, you have 154 | echo version $java_version 155 | echo 156 | echo Please go to http://www.java.com/getjava/ and download 157 | echo a valid Java Runtime and install before running $script_name. 158 | echo 159 | exit 1 160 | fi 161 | } 162 | 163 | 164 | run() { 165 | # no jar? download it. 166 | [[ -f "$sbt_jar" ]] || acquire_sbt_jar "$sbt_version" || { 167 | # still no jar? uh-oh. 168 | echo "Download failed. Obtain the sbt-launch.jar manually and place it at $sbt_jar" 169 | exit 1 170 | } 171 | 172 | # process the combined args, then reset "$@" to the residuals 173 | process_args "$@" 174 | set -- "${residual_args[@]}" 175 | argumentCount=$# 176 | 177 | # TODO - java check should be configurable... 178 | checkJava "1.6" 179 | 180 | #If we're in cygwin, we should use the windows config, and terminal hacks 181 | if [[ "$CYGWIN_FLAG" == "true" ]]; then 182 | stty -icanon min 1 -echo > /dev/null 2>&1 183 | addJava "-Djline.terminal=jline.UnixTerminal" 184 | addJava "-Dsbt.cygwin=true" 185 | fi 186 | 187 | # run sbt 188 | execRunner "$java_cmd" \ 189 | ${SBT_OPTS:-$default_sbt_opts} \ 190 | $(get_mem_opts $sbt_mem) \ 191 | ${JAVA_OPTS} \ 192 | ${java_args[@]} \ 193 | -jar "$sbt_jar" \ 194 | "${sbt_commands[@]}" \ 195 | "${residual_args[@]}" 196 | 197 | exit_code=$? 198 | 199 | # Clean up the terminal from cygwin hacks. 200 | if [[ "$CYGWIN_FLAG" == "true" ]]; then 201 | stty icanon echo > /dev/null 2>&1 202 | fi 203 | exit $exit_code 204 | } 205 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /src/templates/bash-template: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Copyright 2015 eleflow.com.br. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | ### ------------------------------- ### 20 | ### Helper methods for BASH scripts ### 21 | ### ------------------------------- ### 22 | 23 | die() { 24 | echo "$@" 1>&2 25 | exit 1 26 | } 27 | 28 | realpath () { 29 | ( 30 | TARGET_FILE="$1" 31 | CHECK_CYGWIN="$2" 32 | 33 | cd "$(dirname "$TARGET_FILE")" 34 | TARGET_FILE=$(basename "$TARGET_FILE") 35 | 36 | COUNT=0 37 | while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] 38 | do 39 | TARGET_FILE=$(readlink "$TARGET_FILE") 40 | cd "$(dirname "$TARGET_FILE")" 41 | TARGET_FILE=$(basename "$TARGET_FILE") 42 | COUNT=$(($COUNT + 1)) 43 | done 44 | 45 | if [ "$TARGET_FILE" == "." -o "$TARGET_FILE" == ".." ]; then 46 | cd "$TARGET_FILE" 47 | TARGET_FILEPATH= 48 | else 49 | TARGET_FILEPATH=/$TARGET_FILE 50 | fi 51 | 52 | # make sure we grab the actual windows path, instead of cygwin's path. 53 | if [[ "x$CHECK_CYGWIN" == "x" ]]; then 54 | echo "$(pwd -P)/$TARGET_FILE" 55 | else 56 | echo $(cygwinpath "$(pwd -P)/$TARGET_FILE") 57 | fi 58 | ) 59 | } 60 | 61 | # TODO - Do we need to detect msys? 62 | 63 | # Uses uname to detect if we're in the odd cygwin environment. 64 | is_cygwin() { 65 | local os=$(uname -s) 66 | case "$os" in 67 | CYGWIN*) return 0 ;; 68 | *) return 1 ;; 69 | esac 70 | } 71 | 72 | # This can fix cygwin style /cygdrive paths so we get the 73 | # windows style paths. 74 | cygwinpath() { 75 | local file="$1" 76 | if is_cygwin; then 77 | echo $(cygpath -w $file) 78 | else 79 | echo $file 80 | fi 81 | } 82 | 83 | # Make something URI friendly 84 | make_url() { 85 | url="$1" 86 | local nospaces=${url// /%20} 87 | if is_cygwin; then 88 | echo "/${nospaces//\\//}" 89 | else 90 | echo "$nospaces" 91 | fi 92 | } 93 | 94 | # This crazy function reads in a vanilla "linux" classpath string (only : are separators, and all /), 95 | # and returns a classpath with windows style paths, and ; separators. 96 | fixCygwinClasspath() { 97 | OLDIFS=$IFS 98 | IFS=":" 99 | read -a classpath_members <<< "$1" 100 | declare -a fixed_members 101 | IFS=$OLDIFS 102 | for i in "${!classpath_members[@]}" 103 | do 104 | fixed_members[i]=$(realpath "${classpath_members[i]}" "fix") 105 | done 106 | IFS=";" 107 | echo "${fixed_members[*]}" 108 | IFS=$OLDIFS 109 | } 110 | 111 | # Fix the classpath we use for cygwin. 112 | fix_classpath() { 113 | cp="$1" 114 | if is_cygwin; then 115 | echo "$(fixCygwinClasspath "$cp")" 116 | else 117 | echo "$cp" 118 | fi 119 | } 120 | # Detect if we should use JAVA_HOME or just try PATH. 
121 | get_java_cmd() { 122 | if [[ -n "$JAVA_HOME" ]] && [[ -x "$JAVA_HOME/bin/java" ]]; then 123 | echo "$JAVA_HOME/bin/java" 124 | else 125 | echo "java" 126 | fi 127 | } 128 | 129 | echoerr () { 130 | echo 1>&2 "$@" 131 | } 132 | vlog () { 133 | [[ $verbose || $debug ]] && echoerr "$@" 134 | } 135 | dlog () { 136 | [[ $debug ]] && echoerr "$@" 137 | } 138 | execRunner () { 139 | # print the arguments one to a line, quoting any containing spaces 140 | [[ $verbose || $debug ]] && echo "# Executing command line:" && { 141 | for arg; do 142 | if printf "%s\n" "$arg" | grep -q ' '; then 143 | printf "\"%s\"\n" "$arg" 144 | else 145 | printf "%s\n" "$arg" 146 | fi 147 | done 148 | echo "" 149 | } 150 | 151 | # we use "exec" here for our pids to be accurate. 152 | exec "$@" 153 | } 154 | addJava () { 155 | dlog "[addJava] arg = '$1'" 156 | java_args+=( "$1" ) 157 | } 158 | addApp () { 159 | dlog "[addApp] arg = '$1'" 160 | app_commands+=( "$1" ) 161 | } 162 | addResidual () { 163 | dlog "[residual] arg = '$1'" 164 | residual_args+=( "$1" ) 165 | } 166 | addDebugger () { 167 | addJava "-Xdebug" 168 | addJava "-Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1" 169 | } 170 | addMemory () { 171 | addJava "-Xmx" 172 | addJava "28g" 173 | } 174 | 175 | # a ham-fisted attempt to move some memory settings in concert 176 | # so they need not be messed around with individually. 177 | get_mem_opts () { 178 | local mem=${1:-2560} 179 | local perm=$(( $mem / 4 )) 180 | (( $perm > 256 )) || perm=256 181 | (( $perm < 1024 )) || perm=1024 182 | local codecache=$(( $perm / 2 )) 183 | 184 | # if we detect any of these settings in ${java_opts} we need to NOT output our settings. 185 | # The reason is the Xms/Xmx, if they don't line up, cause errors. 186 | if [[ "${java_opts}" == *-Xmx* ]] || 187 | [[ "${java_opts}" == *-Xms* ]] || 188 | [[ "${java_opts}" == *-XX:MaxPermSize* ]] || 189 | [[ "${java_opts}" == *-XX:ReservedCodeCacheSize* ]] || 190 | # check java arguments for settings, too 191 | [[ "${java_args[@]}" == *-Xmx* ]] || 192 | [[ "${java_args[@]}" == *-Xms* ]] || 193 | [[ "${java_args[@]}" == *-XX:MaxPermSize* ]] || 194 | [[ "${java_args[@]}" == *-XX:ReservedCodeCacheSize* ]]; 195 | then 196 | echo "" 197 | elif [[ !$no_version_check ]] && [[ "$java_version" > "1.8" ]]; then 198 | echo "-Xms${mem}m -Xmx${mem}m -XX:ReservedCodeCacheSize=${codecache}m" 199 | else 200 | echo "-Xms${mem}m -Xmx${mem}m -XX:MaxPermSize=${perm}m -XX:ReservedCodeCacheSize=${codecache}m" 201 | fi 202 | } 203 | require_arg () { 204 | local type="$1" 205 | local opt="$2" 206 | local arg="$3" 207 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then 208 | die "$opt requires <$type> argument" 209 | fi 210 | } 211 | is_function_defined() { 212 | declare -f "$1" > /dev/null 213 | } 214 | 215 | # Attempt to detect if the script is running via a GUI or not 216 | # TODO - Determine where/how we use this generically 217 | detect_terminal_for_ui() { 218 | [[ ! -t 0 ]] && [[ "${#residual_args}" == "0" ]] && { 219 | echo "true" 220 | } 221 | # SPECIAL TEST FOR MAC 222 | [[ "$(uname)" == "Darwin" ]] && [[ "$HOME" == "$PWD" ]] && [[ "${#residual_args}" == "0" ]] && { 223 | echo "true" 224 | } 225 | } 226 | 227 | # Processes incoming arguments and places them in appropriate global variables. called by the run method. 
228 | process_args () { 229 | local no_more_snp_opts=0 230 | while [[ $# -gt 0 ]]; do 231 | case "$1" in 232 | --) shift && no_more_snp_opts=1 && break ;; 233 | -h|-help) usage; exit 1 ;; 234 | -v|-verbose) verbose=1 && shift ;; 235 | -d|-debug) debug=1 && shift ;; 236 | 237 | -no-version-check) no_version_check=1 && shift ;; 238 | 239 | -mem) require_arg integer "$1" "$2" && app_mem="$2" && shift 2 ;; 240 | -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;; 241 | 242 | -main) custom_mainclass="$2" && shift 2 ;; 243 | 244 | -java-home) require_arg path "$1" "$2" && java_cmd="$2/bin/java" && shift 2 ;; 245 | 246 | -D*) addJava "$1" && shift ;; 247 | -J*) addJava "${1:2}" && shift ;; 248 | *) addResidual "$1" && shift ;; 249 | esac 250 | done 251 | 252 | if [[ no_more_snp_opts ]]; then 253 | while [[ $# -gt 0 ]]; do 254 | addResidual "$1" && shift 255 | done 256 | fi 257 | 258 | is_function_defined process_my_args && { 259 | myargs=("${residual_args[@]}") 260 | residual_args=() 261 | process_my_args "${myargs[@]}" 262 | } 263 | } 264 | 265 | # Actually runs the script. 266 | run() { 267 | # TODO - check for sane environment 268 | 269 | # process the combined args, then reset "$@" to the residuals 270 | process_args "$@" 271 | set -- "${residual_args[@]}" 272 | argumentCount=$# 273 | 274 | #check for jline terminal fixes on cygwin 275 | if is_cygwin; then 276 | stty -icanon min 1 -echo > /dev/null 2>&1 277 | addJava "-Djline.terminal=jline.UnixTerminal" 278 | addJava "-Dsbt.cygwin=true" 279 | fi 280 | 281 | # check java version 282 | if [[ ! $no_version_check ]]; then 283 | java_version_check 284 | fi 285 | 286 | if [ -n "$custom_mainclass" ]; then 287 | mainclass="$custom_mainclass" 288 | else 289 | mainclass="$app_mainclass" 290 | fi 291 | 292 | # Now we check to see if there are any java opts on the environment. These get listed first, with the script able to override them. 293 | if [[ "$JAVA_OPTS" != "" ]]; then 294 | java_opts="${JAVA_OPTS}" 295 | fi 296 | 297 | # run sbt 298 | execRunner "$java_cmd" \ 299 | $(get_mem_opts $app_mem) \ 300 | ${java_opts[@]} \ 301 | "${java_args[@]}" \ 302 | -cp "$(fix_classpath "${app_home}/../ec2:$lib_dir/*:/opt/spark/lib/*:$lib_dir/../*")" \ 303 | $mainclass \ 304 | "${app_commands[@]}" \ 305 | "${residual_args[@]}" 306 | 307 | local exit_code=$? 308 | if is_cygwin; then 309 | stty icanon echo > /dev/null 2>&1 310 | fi 311 | exit $exit_code 312 | } 313 | 314 | # Loads a configuration file full of default command line options for this script. 315 | loadConfigFile() { 316 | cat "$1" | sed '/^\#/d' 317 | } 318 | 319 | # Now check to see if it's a good enough version 320 | # TODO - Check to see if we have a configured default java version, otherwise use 1.6 321 | java_version_check() { 322 | readonly java_version=$("$java_cmd" -version 2>&1 | awk -F '"' '/version/ {print $2}') 323 | if [[ "$java_version" == "" ]]; then 324 | echo 325 | echo No java installations was detected. 326 | echo Please go to http://www.java.com/getjava/ and download 327 | echo 328 | exit 1 329 | elif [[ ! "$java_version" > "1.6" ]]; then 330 | echo 331 | echo The java installation you have is not up to date 332 | echo $app_name requires at least version 1.6+, you have 333 | echo version $java_version 334 | echo 335 | echo Please go to http://www.java.com/getjava/ and download 336 | echo a valid Java Runtime and install before running $app_name. 
337 | echo 338 | exit 1 339 | fi 340 | } 341 | 342 | ### ------------------------------- ### 343 | ### Start of customized settings ### 344 | ### ------------------------------- ### 345 | usage() { 346 | cat < set memory options in MB (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) 354 | -main Define a custom main class 355 | -jvm-debug Turn on JVM debugging, open at the given port. 356 | 357 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) 358 | -java-home alternate JAVA_HOME 359 | 360 | # jvm options and output control 361 | JAVA_OPTS environment variable, if unset uses "$java_opts" 362 | -Dkey=val pass -Dkey=val directly to the java runtime 363 | -J-X pass option -X directly to the java runtime 364 | (-J is stripped) 365 | 366 | # special option 367 | -- To stop parsing built-in commands from the rest of the command-line. 368 | e.g.) enabling debug and sending -d as app argument 369 | \$ ./start-script -d -- -d 370 | 371 | In the case of duplicated or conflicting options, basically the order above 372 | shows precedence: JAVA_OPTS lowest, command line options highest except "--". 373 | EOM 374 | } 375 | 376 | ### ------------------------------- ### 377 | ### Main script ### 378 | ### ------------------------------- ### 379 | 380 | declare -a residual_args 381 | declare -a java_args 382 | declare -a app_commands 383 | declare -r real_script_path="$(realpath "$0")" 384 | declare -r app_home="$(realpath "$(dirname "$real_script_path")")" 385 | # TODO - Check whether this is ok in cygwin... 386 | declare -r lib_dir="$(realpath "${app_home}/../lib")" 387 | ${{template_declares}} 388 | # java_cmd is overrode in process_args when -java-home is used 389 | declare java_cmd=$(get_java_cmd) 390 | 391 | # if configuration files exist, prepend their contents to $@ so it can be processed by this runner 392 | [[ -f "$script_conf_file" ]] && set -- $(loadConfigFile "$script_conf_file") "$@" 393 | 394 | run "$@" 395 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/SparkNotebookContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package eleflow.sparknotebook 17 | 18 | import java.io.{FileNotFoundException, InputStream, OutputStream} 19 | import java.net.URI 20 | import com.amazonaws.services.s3.model.{GetObjectRequest, ObjectMetadata, PutObjectRequest, S3Object} 21 | import com.amazonaws.services.s3.{AmazonS3, AmazonS3Client} 22 | import eleflow.sparknotebook.data.Dataset 23 | 24 | import org.apache.commons.io.IOUtils 25 | import org.apache.hadoop.conf.Configuration 26 | import org.apache.hadoop.fs.{FileStatus, FileUtil, FileSystem, Path} 27 | import org.apache.spark.sql.hive.HiveContext 28 | import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 29 | import org.apache.spark.{Logging, SparkConf, SparkContext} 30 | 31 | import scala.annotation.tailrec 32 | import scala.sys.process._ 33 | import scala.util.Try 34 | import scala.util.matching.Regex 35 | 36 | object ClusterSettings { 37 | 38 | var kryoBufferMaxSize: Option[String] = None 39 | var maxResultSize = "2g" 40 | var masterInstanceType = "r3.large" 41 | var coreInstanceType = "r3.large" 42 | var coreInstanceCount = 3 43 | var spotPriceFactor: Option[String] = Some("1.3") 44 | var ec2KeyName: Option[String] = None 45 | var hadoopVersion = "2" 46 | var clusterName = "SparkNotebookCluster" 47 | var region: Option[String] = None 48 | var profile: Option[String] = None 49 | var resume = false 50 | var executorMemory: Option[String] = None 51 | var defaultParallelism: Option[Int] = None 52 | var master: Option[String] = None 53 | 54 | def slavesCores = ClusterSettings.coreInstanceType match { 55 | case s: String if s.endsWith("xlarge") => 4 56 | case s: String if s.endsWith("2xlarge") => 8 57 | case s: String if s.endsWith("4xlarge") => 16 58 | case s: String if s.endsWith("8xlarge") => 32 59 | case _ => 2 60 | } 61 | 62 | def getNumberOfCores = ClusterSettings.coreInstanceCount * slavesCores 63 | } 64 | 65 | 66 | /** 67 | * User: paulomagalhaes 68 | * Date: 8/15/14 12:24 PM 69 | */ 70 | 71 | class SparkNotebookContext(@transient sparkConf: SparkConf) extends Serializable with Logging { 72 | val version = SparkNotebookVersion.version 73 | 74 | protected def this(sparkConf: SparkConf, data: String) = this(sparkConf) 75 | 76 | @transient protected lazy val s3Client: AmazonS3 = new AmazonS3Client() 77 | @transient protected var sc: Option[SparkContext] = None 78 | @transient var _sqlContext: Option[HiveContext] = None 79 | private var _masterHost: Option[String] = None 80 | protected val basePath: String = "/" 81 | 82 | def sparkContext(): SparkContext = sc getOrElse { 83 | val context = if (ClusterSettings.master.isDefined) createSparkContextForProvisionedCluster(sparkConf) 84 | else createSparkContextForNewCluster(sparkConf) 85 | addClasspathToSparkContext(context) 86 | sc = Some(context) 87 | context 88 | } 89 | 90 | def addClasspathToSparkContext(context: SparkContext) { 91 | val jodaJar = "joda-time.joda-time-.*jar".r 92 | val sparkNotebookContextJar = "eleflow.sparknotebook-.*jar".r 93 | val guavaJar = "com.google.guava.*".r 94 | val mySqlDriver = "mysql-connector-java.*".r 95 | val urls = this.getClass().getClassLoader().asInstanceOf[java.net.URLClassLoader].getURLs 96 | val jarUrls = urls.filter(url => 97 | jodaJar.findFirstIn(url.getFile) != None 98 | || sparkNotebookContextJar.findFirstIn(url.getFile) != None 99 | || guavaJar.findFirstIn(url.getFile) != None 100 | || mySqlDriver.findFirstIn(url.getFile) != None) 101 | jarUrls.foreach { url => 102 | logInfo(s"adding ${url.getPath} to spark context jars") 103 | context.addJar(url.getPath) 
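// addJar ships each of these jars to the executors so code submitted from the notebook can resolve them at runtime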
104 | } 105 | } 106 | 107 | def masterHost(): String = { 108 | return _masterHost match { 109 | case Some(host) => host 110 | case None => { 111 | initHostNames 112 | _masterHost.get 113 | } 114 | } 115 | } 116 | 117 | def initHostNames { 118 | _masterHost = createCluster(); 119 | } 120 | 121 | def masterHost_=(host: String): Unit = _masterHost = Some(host) 122 | 123 | def sqlContext(): HiveContext = { 124 | _sqlContext match { 125 | case None => { 126 | _sqlContext = Some(new HiveContext(sparkContext)); 127 | HiveThriftServer2.startWithContext(_sqlContext.get) 128 | _sqlContext.get 129 | } 130 | case Some(ctx) => ctx 131 | } 132 | } 133 | 134 | def createSparkContextForNewCluster(conf: SparkConf): SparkContext = { 135 | log.info(s"connecting to $masterHost") 136 | conf.setMaster(s"spark://$masterHost:7077") 137 | confSetup(conf) 138 | } 139 | 140 | private def confSetup(conf: SparkConf): SparkContext = { 141 | ClusterSettings.defaultParallelism.map(value => conf.set("spark.default.parallelism", value.toString)) 142 | ClusterSettings.kryoBufferMaxSize.map(value => conf.set("spark.kryoserializer.buffer.max.mb", value.toString)) 143 | //according to keo, in Making Sense of Spark Performance webcast, this codec is better than default 144 | conf.set("spark.io.compression.codec", "lzf") 145 | conf.set("spark.driver.maxResultSize", ClusterSettings.maxResultSize) 146 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 147 | ClusterSettings.executorMemory.foreach(conf.set("spark.executor.memory", _)) 148 | 149 | val defaultConfStream = this.getClass.getClassLoader.getResourceAsStream("spark-defaults.conf") 150 | if (defaultConfStream != null) { 151 | import scala.collection.JavaConversions._ 152 | val defaultConf = IOUtils.readLines(defaultConfStream) 153 | defaultConf.map { line => 154 | val keyValue = line.split("\\s+") 155 | if (keyValue.size == 2) 156 | conf.set(keyValue(0), keyValue(1)) 157 | } 158 | } 159 | //according to keo, in Making Sense of Spark Performance webcast, this codec is better than default 160 | conf.set("spark.io.compression.codec", "lzf") 161 | 162 | ClusterSettings.defaultParallelism.map(value => conf.set("spark.default.parallelism", value.toString)) 163 | ClusterSettings.kryoBufferMaxSize.map(value => conf.set("spark.kryoserializer.buffer.max.mb", value.toString)) 164 | 165 | conf.set("spark.driver.maxResultSize", ClusterSettings.maxResultSize) 166 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 167 | ClusterSettings.executorMemory.foreach(conf.set("spark.executor.memory", _)) 168 | println("sparkcontext") 169 | new SparkContext(conf) 170 | } 171 | 172 | def createSparkContextForProvisionedCluster(conf: SparkConf): SparkContext = { 173 | log.info("connecting to localhost") 174 | conf.setMaster(ClusterSettings.master.get) 175 | confSetup(conf) 176 | } 177 | 178 | def shellRun(command: Seq[String]) = { 179 | val out = new StringBuilder 180 | 181 | val logger = ProcessLogger( 182 | (o: String) => { 183 | out.append(o); 184 | logInfo(o) 185 | }, 186 | (e: String) => { 187 | println(e); 188 | logInfo(e) 189 | }) 190 | command ! 
logger 191 | out.toString() 192 | } 193 | 194 | def createCluster(): Option[String] = { 195 | 196 | val path = getClass.getResource(s"${basePath}spark_ec2.py").getPath 197 | import ClusterSettings._ 198 | val mandatory = Seq(path, 199 | "--hadoop-major-version", hadoopVersion, 200 | "--master-instance-type", masterInstanceType, 201 | "--slaves", coreInstanceCount.toString, 202 | "--instance-type", coreInstanceType) 203 | val command = mandatory ++ (ec2KeyName match { 204 | case None => Seq[String]() 205 | case Some(ec2KeyName) => Seq("--key-pair", ec2KeyName) 206 | }) ++ (spotPriceFactor match { 207 | case None => Seq[String]() 208 | case Some(spotPrice) => Seq("--spot-price", spotPrice.toString) 209 | }) ++ (region match { 210 | case None => Seq[String]() 211 | case Some(region) => Seq("--region", region.toString) 212 | }) ++ (profile match { 213 | case None => Seq[String]() 214 | case Some(profile) => Seq("--profile", profile.toString) 215 | }) ++ (if (resume) Seq("--resume") else Seq()) 216 | 217 | val output = shellRun((command ++ Seq("launch", clusterName))) 218 | 219 | val pattern = new Regex("Spark standalone cluster started at http://([^:]+):8080") 220 | val host = pattern.findAllIn(output).matchData.map(_.group(1)).next 221 | return Some(host) 222 | } 223 | 224 | def terminate() { 225 | clearContext 226 | val path = getClass.getResource(s"${basePath}spark_ec2.py").getPath 227 | import ClusterSettings._ 228 | 229 | val output = shellRun(Seq(path, "destroy", clusterName)) 230 | _masterHost = None 231 | ClusterSettings.resume = false 232 | } 233 | 234 | def clusterInfo() { 235 | val path = getClass.getResource(s"${basePath}spark_ec2.py").getPath 236 | import ClusterSettings._ 237 | val output = shellRun(Seq(path, "get-master", clusterName)) 238 | } 239 | 240 | def clearContext { 241 | ClusterSettings.resume = true 242 | sc.map { 243 | f => 244 | f.cancelAllJobs() 245 | f.stop() 246 | } 247 | _sqlContext = None 248 | sc = None 249 | } 250 | 251 | def reconnect(): Unit = { 252 | sc.map(_.stop()) 253 | sc = None 254 | _sqlContext = None 255 | } 256 | 257 | def getAllFilesRecursively(path: Path): Seq[String] = { 258 | val fs = path.getFileSystem(new Configuration) 259 | @tailrec 260 | def iter(fs: FileSystem, paths: Seq[Path], result: Seq[String]): Seq[String] = paths match { 261 | case path :: tail => 262 | val children: Seq[FileStatus] = try { 263 | fs.listStatus(path) 264 | } catch { 265 | case e: FileNotFoundException => 266 | // listStatus throws FNFE if the dir is empty 267 | Seq.empty[FileStatus] 268 | } 269 | val (files, directories) = children.partition(_.isFile) 270 | iter(fs, tail ++ directories.map(_.getPath), files.map(_.getPath.toString) ++ result) 271 | case _ => 272 | result 273 | } 274 | iter(fs, Seq(path), Seq()) 275 | } 276 | 277 | def copyDir(input: String, output: String): Unit = { 278 | val from = createPathInstance(input) 279 | 280 | val files = getAllFilesRecursively(from) 281 | val to = output.replaceAll(new URI(input).getPath, "") 282 | copyDir(files, to) 283 | } 284 | 285 | def copyDir(inputFiles: Seq[String], output: String): Unit = { 286 | sparkContext.parallelize(inputFiles).foreach { inputFile => 287 | val from = new URI(inputFile) 288 | 289 | copy(inputFile, s"$output/${from.getPath}") 290 | } 291 | } 292 | 293 | def copy(input: String, output: String): Unit = { 294 | val from = new URI(input) 295 | val to = new URI(output) 296 | val fromScheme = from.getScheme 297 | val toScheme = to.getScheme 298 | val conf = new Configuration() 299 | 300 | (fromScheme, 
toScheme) match { 301 | case ("s3n" | "s3", "s3n" | "s3") => ??? 302 | case (fromAddr, _) if (fromAddr.startsWith("s3")) => { 303 | val outputPath = createPathInstance(output) 304 | val fs = createPathInstance(output).getFileSystem(conf) 305 | copyFromS3(from, outputPath, fs) 306 | } 307 | case _ => { 308 | val srcPath = createPathInstance(input) 309 | val srcFs = srcPath.getFileSystem(conf) 310 | val dstPath = createPathInstance(output) 311 | val dstFs = dstPath.getFileSystem(conf) 312 | FileUtil.copy(srcFs, srcPath, dstFs, dstPath, false, conf) 313 | } 314 | } 315 | } 316 | 317 | def fs(pathStr: String): FileSystem = { 318 | val path = createPathInstance(pathStr) 319 | path.getFileSystem(new Configuration) 320 | } 321 | 322 | def sql(sql: String) = { 323 | sqlContext().sql(sql) 324 | } 325 | 326 | protected def copyFromS3(input: URI, path: Path, fs: FileSystem): Unit = { 327 | val rangeObjectRequest: GetObjectRequest = new GetObjectRequest(input.getHost, input.getPath.substring(1)) 328 | val inputStream: Try[InputStream] = Try { 329 | 330 | val objectPortion: S3Object = s3Client.getObject(rangeObjectRequest) 331 | objectPortion.getObjectContent() 332 | } 333 | inputStream.map { 334 | in => 335 | val copyResult = Try(fs.create(path)).flatMap { 336 | out => 337 | val copyResult = copyStreams(in, out) 338 | out.close 339 | copyResult 340 | } 341 | in.close 342 | copyResult 343 | }.recover { 344 | case e: Exception => throw e 345 | } 346 | } 347 | 348 | protected def createPathInstance(input: String) = new Path(input) 349 | 350 | protected def copyStreams(in: InputStream, out: OutputStream) = Try(IOUtils.copy(in, out)) 351 | 352 | protected def copyToS3(input: Path, bucket: String, fileName: String): Unit = { 353 | 354 | val objRequest = new PutObjectRequest(bucket, fileName, readFromHDFS(input), new ObjectMetadata()) 355 | s3Client.putObject(objRequest) 356 | } 357 | 358 | private def readFromHDFS(input: Path) = { 359 | val fs = input.getFileSystem(new Configuration) 360 | fs.open(input) 361 | } 362 | 363 | def load(file: String, separator: String = ",") = { 364 | Dataset(this, file, separator) 365 | } 366 | 367 | } 368 | 369 | -------------------------------------------------------------------------------- /src/main/scala/eleflow/sparknotebook/data/Dataset.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 eleflow.com.br. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package eleflow.sparknotebook.data 17 | 18 | import java.net.URI 19 | import java.sql.Timestamp 20 | import eleflow.sparknotebook.enums.DataSetType 21 | import eleflow.sparknotebook.{SparkNotebookContext, ClusterSettings} 22 | import eleflow.sparknotebook.exception.{InvalidDataException, UnexpectedFileFormatException} 23 | import eleflow.sparknotebook.util.DateTimeParser 24 | import eleflow.sparknotebook.enums.DateSplitType._ 25 | import eleflow.sparknotebook.enums.DataSetType 26 | import eleflow.sparknotebook.exception.InvalidDataException 27 | import eleflow.sparknotebook.SparkNotebookContext 28 | import eleflow.sparknotebook.util.DateTimeParser 29 | import org.apache.spark.rdd.RDD 30 | import org.apache.spark.sql._ 31 | import org.apache.spark.sql.catalyst.types.{DataType, StructField} 32 | import org.apache.spark.storage.StorageLevel 33 | import org.joda.time.{DateTime, DateTimeZone, Days} 34 | 35 | import scala.collection.immutable.TreeSet 36 | 37 | 38 | /** 39 | * SparkNotebook 40 | * Copyright (C) 2014 eleflow. 41 | * User: paulomagalhaes 42 | * Date: 11/4/14 3:44 PM 43 | */ 44 | 45 | object Dataset { 46 | implicit def DatasetToSchemaRdd(dataset: Dataset): SchemaRDD = dataset.toSchemaRDD() 47 | 48 | implicit def SchemaRddToDataset(schemaRdd: SchemaRDD): Dataset = new Dataset(schemaRdd) 49 | 50 | implicit def FileDatasetToDataset(fileDS: FileDataset): Dataset = new Dataset(fileDS.toSchemaRDD) 51 | 52 | implicit def FileDatasetToSchemaRdd(fileDS: FileDataset): SchemaRDD = fileDS.toSchemaRDD 53 | 54 | def apply(uc: SparkNotebookContext, file: String, separator: String = ",") = { 55 | new FileDataset(uc, file, separator) 56 | } 57 | } 58 | 59 | class Dataset private[data](schemaRdd: SchemaRDD, originalDataset: Option[Dataset] = None, defaultSummarizedColumns: Option[RDD[(Int, (Int, (Any) => Int, (Any) => Double))]] = None) extends Serializable { 60 | originalDataset.map(f => nameSchemaRDD(f.toSchemaRDD)).getOrElse(nameSchemaRDD(schemaRdd)).map(schemaRdd.registerTempTable(_)) 61 | 62 | private def nameSchemaRDD(schemaRDD: SchemaRDD) = { 63 | schemaRDD.name match { 64 | case null => None 65 | case _ => Some(schemaRdd.name) 66 | } 67 | } 68 | 69 | import org.apache.spark.SparkContext._ 70 | 71 | lazy val columnsSize = summarizedColumns.map(_._2._1).sum().toInt 72 | 73 | lazy val summarizedColumns = defaultSummarizedColumns.getOrElse(summarizeColumns.setName("summarizedColumns").cache()) 74 | 75 | lazy val columnIndexOf = this.schema.fieldNames.zipWithIndex.toSet.toMap 76 | 77 | private def summarizeColumns = { 78 | val fieldsTuple = schemaRdd.schema.fields.zipWithIndex.partition(f => f._1.dataType == StringType) 79 | val (stringFields, nonStringFields) = (fieldsTuple._1.map(_._2), fieldsTuple._2.map(_._2)) 80 | val valuex = schemaRdd.flatMap { 81 | row => 82 | stringFields.map { 83 | sf => 84 | (sf, TreeSet(row.getString(sf))) 85 | } 86 | }.reduceByKey(_ ++ _) 87 | val stringFieldsRdd: RDD[(Int, (Int, (Any => Int), (Any => Double)))] = valuex.map { 88 | case (index, values) => (index ->(values.size, values.zipWithIndex.map(f => (f._1, f._2)).toMap, ((_: Any) => 1.0))) 89 | } 90 | val nonStringMap: Seq[(Int, (Int, (Any => Int), (Any => Double)))] = nonStringFields.map { f => (f, (1, ((_: Any) => 0), ((DataTransformer.toDouble _))))} 91 | stringFieldsRdd.union(stringFieldsRdd.context.parallelize(nonStringMap)) 92 | } 93 | 94 | lazy val summarizedColumnsIndex = summarizeColumnsIndex 95 | 96 | private def summarizeColumnsIndex = { 97 | val summarized = 
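// Maps every expanded column position to its (column name, value) pair: string
// columns contribute one entry per distinct value, non-string columns a single
// entry with an empty value; the fold below re-bases the indices so they are
// global across all columns.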
summarizedColumns.sortBy(_._1).map { 98 | f => 99 | f._2._2 match { 100 | case m: Map[Any, Int] => (f._1, m.map(value => value._2 ->(this.schema.fieldNames(f._1), value._1.toString))) 101 | case _: (Any => Int) => (f._1, Map(0 ->(this.schema.fieldNames(f._1), ""))) 102 | } 103 | }.collect 104 | summarized.foldLeft(Map.empty[Int, (String, String)])((b, a) => 105 | (b ++ a._2.map(f => (f._1 + b.size -> f._2)))) 106 | } 107 | 108 | var labels = Seq(0) 109 | 110 | def applyColumnNames(columnNames: Seq[String]) = { 111 | val structFields = schemaRdd.schema.fields.zip(columnNames).map { 112 | case (structField, columnName) => 113 | new StructField(columnName, structField.dataType, structField.nullable) 114 | } 115 | val newSchemaStruct = StructType(structFields) 116 | val newSchemaRDD = schemaRdd.sqlContext.applySchema(schemaRdd, newSchemaStruct) 117 | newSchemaRDD.name = this.name 118 | new Dataset(newSchemaRDD, Some(this)) 119 | } 120 | 121 | def applyColumnTypes(columnTypes: Seq[DataType]) = { 122 | val structFields = schemaRdd.schema.fields.zip(columnTypes).map { 123 | case (structField, dataType) => 124 | new StructField(structField.name, dataType, structField.nullable) 125 | } 126 | val newSchema = StructType(structFields) 127 | val newRowRDD = convert(schemaRdd, newSchema) 128 | 129 | val newSchemaRDD = schemaRdd.sqlContext.applySchema(newRowRDD, newSchema) 130 | newSchemaRDD.name = this.name 131 | new Dataset(newSchemaRDD, Some(this)) 132 | } 133 | 134 | def columnTypes(): Seq[DataType] = { 135 | schemaRdd.schema.fields.map(_.dataType) 136 | } 137 | 138 | def columnNames(): Seq[String] = { 139 | schemaRdd.schema.fields.map(_.name) 140 | } 141 | 142 | private def convert(rowRdd: RDD[Row], newSchema: StructType): RDD[Row] = { 143 | rowRdd.map { row => 144 | val values = row.zip(newSchema.fields).map { 145 | case (null, _) => null 146 | case (value: Double, StructField(_, DoubleType, _, _)) => value 147 | case (value: BigDecimal, StructField(_, DecimalType(), _, _)) => value 148 | case (value: Timestamp, StructField(_, TimestampType, _, _)) => value 149 | case (value: Long, StructField(_, LongType, _, _)) => value 150 | case (value: Int, StructField(_, IntegerType, _, _)) => value 151 | case (value: Short, StructField(_, ShortType, _, _)) => value 152 | case (value: Boolean, StructField(_, BooleanType, _, _)) => value 153 | case (value, StructField(_, DecimalType(), _, _)) => BigDecimal(value.toString) 154 | case (value, StructField(_, DoubleType, _, _)) => value.toString.toDouble 155 | case (value, StructField(_, LongType, _, _)) => value.toString.toLong 156 | case (value, StructField(_, IntegerType, _, _)) => value.toString.toInt 157 | case (value, StructField(_, ShortType, _, _)) => value.toString.toShort 158 | //converter de double 159 | case (value, StructField(_, BooleanType, _, _)) => value.toString match { 160 | case "1" | "t" | "true" => true 161 | case "0" | "f" | "false" => false 162 | case a => throw new InvalidDataException(s"$a is an invalid Boolean value") 163 | } 164 | case (value, StructField(_, TimestampType, _, _)) => new Timestamp(DateTimeParser.parse(value.toString).map(_.toDate.getTime).getOrElse(throw new InvalidDataException("Unsupported data format Exception, please specify the date format"))) 165 | case (value, StructField(_, StringType, _, _)) => value.toString 166 | } 167 | Row(values: _*) 168 | } 169 | } 170 | 171 | def sliceByName(includes: Seq[String] = (schemaRdd.schema.fields.map(_.name)), excludes: Seq[String] = Seq[String]()): Dataset = { 172 | val 
includesIndices = schemaRdd.schema.fields.zipWithIndex.collect { 173 | case (structField, index) if (includes.contains(structField.name) && !excludes.contains(structField.name)) => index 174 | } 175 | slice(includesIndices, Seq[Int]()) 176 | } 177 | 178 | def slice(includes: Seq[Int] = (0 to schemaRdd.schema.fields.size), excludes: Seq[Int] = Seq.empty[Int]): Dataset = { 179 | 180 | val fields = schemaRdd.schema.fields.zipWithIndex.collect { 181 | case (structField, index) if (includes.contains(index) && !excludes.contains(index)) => structField.name; 182 | } 183 | import schemaRdd.sqlContext.symbolToUnresolvedAttribute 184 | val filtered = fields.map(x => symbolToUnresolvedAttribute(Symbol(x))) 185 | val newSchemaRdd = schemaRdd.select(filtered: _*) 186 | new Dataset(newSchemaRdd, None) 187 | } 188 | 189 | def toSchemaRDD(): SchemaRDD = schemaRdd 190 | 191 | def toLabeledPoint = { 192 | DataTransformer.createLabeledPointFromRDD(schemaRdd, labels, summarizedColumns, DataSetType.Test, columnsSize - 1).values 193 | } 194 | 195 | def formatDateValues(index: Int, dateSplitter: Long): SchemaRDD = { 196 | val rdd = schemaRdd.map { f => 197 | val (before, after) = f.toSeq.splitAt(index) 198 | val formattedDate = splitDateValues(f, index, dateSplitter) 199 | Row(before ++ formattedDate ++ after.headOption.map(_ => after.tail).getOrElse(Seq.empty): _*) 200 | } 201 | val (beforeFields, afterFields) = schemaRdd.schema.fields.splitAt(index) 202 | val dateFields = (1 to determineSizeOfSplitter(dateSplitter)).map(index => new StructField(afterFields.head.name + index, IntegerType, false)) 203 | val fields = beforeFields ++ dateFields ++ afterFields.headOption.map(_ => afterFields.tail).getOrElse(Seq.empty) 204 | val newSchema = StructType(fields) 205 | val newRowRDD = convert(rdd, newSchema) 206 | 207 | val newSchemaRDD = schemaRdd.sqlContext.applySchema(newRowRDD, newSchema) 208 | newSchemaRDD.name = this.name 209 | new Dataset(newSchemaRDD, Some(this)) 210 | } 211 | 212 | type DateSplitterColumnSize = (Long, Long, Int) => Int 213 | type NoSplitterColumnSize = (Long, Int) => Int 214 | 215 | private def splitVerifier: DateSplitterColumnSize = (dateSplitter: Long, verifier: Long, value: Int) => 216 | if (contains(dateSplitter, verifier)) { 217 | value + 1 218 | } else value 219 | 220 | private def noSplit: NoSplitterColumnSize = (dateSplitter: Long, value: Int) => 221 | if (contains(dateSplitter, NoSplit)) { 222 | 0 223 | } else value 224 | 225 | private def determineSizeOfSplitter(dateSplitter: Long) = 226 | splitVerifier(dateSplitter, Period, 227 | splitVerifier(dateSplitter, DayOfAWeek, 228 | noSplit(dateSplitter, 0) 229 | ) 230 | ) 231 | 232 | 233 | val dayZero = new DateTime(1970, 1, 1, 0, 0, 0) 234 | 235 | type DateTimeToInt = DateTime => Int 236 | 237 | type RowDateSplitter = (Long, DateTimeToInt, Seq[Int]) => Seq[Int] 238 | 239 | val daysBetween: DateTimeToInt = { 240 | case d: DateTime => Days.daysBetween(dayZero, d).getDays 241 | } 242 | val getDayOfAWeek: DateTimeToInt = { 243 | case d: DateTime => d.getDayOfWeek 244 | } 245 | val period: DateTimeToInt = { 246 | case d: DateTime => DateTimeParser.period(d).id 247 | } 248 | 249 | protected def splitDateValues(line: Row, index: Int, dateSplitter: Long) = { 250 | def splitDateValues: RowDateSplitter = { 251 | (verifier: Long, datetimefunc: DateTimeToInt, seq: Seq[Int]) => 252 | if (contains(dateSplitter, verifier)) { 253 | val dateValue = if (line.isNullAt(index)) dayZero else new DateTime(line(index).asInstanceOf[Timestamp].getTime, 
DateTimeZone.UTC) 254 | seq ++ Seq(datetimefunc(dateValue)) 255 | } else seq 256 | } 257 | splitDateValues(Period, period, splitDateValues(DayOfAWeek, getDayOfAWeek, splitDateValues(NoSplit, daysBetween, Seq.empty[Int]))) 258 | } 259 | 260 | 261 | 262 | 263 | def translateCorrelation(array: Array[(Double, Int)]) = { 264 | array.map { 265 | f => summarizedColumns.map { 266 | g => g 267 | } 268 | } 269 | } 270 | } 271 | 272 | class FileDataset protected[data](@transient uc: SparkNotebookContext, file: String, separator: String = ",", header: Option[String] = None) extends Serializable { 273 | 274 | lazy val numberOfPartitions = 4 * (ClusterSettings.getNumberOfCores) 275 | 276 | lazy val columnTypes: Array[DataType] = typeLine.map(dataType) 277 | 278 | lazy val typeLine: Array[String] = extractFirstCompleteLine(originalRdd) 279 | 280 | lazy val columnNames: Array[String] = headerOrFirstLine().split(separator, -1) 281 | 282 | lazy val firstLine: String = loadedRDD.first 283 | 284 | lazy val loadedRDD = { 285 | println(s"localFileName:$localFileName") 286 | val file = uc.sparkContext.textFile(localFileName) 287 | println("file") 288 | file 289 | } 290 | 291 | lazy val localFileName: String = { 292 | uc.sparkContext() // make sure that the cluster is up 293 | val uri = Some(new URI(file)) 294 | val destURI = uri.filter { f => f.getScheme() != null && f.getScheme().startsWith("s3")}.map { vl => 295 | val destURI = s"hdfs:///tmp${vl.getPath()}" 296 | uc.copy(file, destURI) 297 | destURI 298 | }.getOrElse(file) 299 | destURI 300 | } 301 | 302 | lazy val originalRdd: RDD[Array[String]] = initOriginalRdd(headerOrFirstLine(), localFileName) 303 | 304 | def headerOrFirstLine(): String = { 305 | header.getOrElse(firstLine) 306 | } 307 | 308 | def initOriginalRdd(header: String, rdd: RDD[String]): RDD[Array[String]] = { 309 | val localHeader = header 310 | val oRdd = rdd.filter(line => line != localHeader).map(_.split(separator, -1)) 311 | oRdd.setName(localFileName) 312 | oRdd.cache 313 | 314 | } 315 | 316 | def initOriginalRdd(header: String, localFileName: String): RDD[Array[String]] = { 317 | initOriginalRdd(header, loadedRDD) 318 | } 319 | 320 | private def dataType(data: String): DataType = { 321 | val double = """[+-]?\d*\.?\d*E?\d{1,4}""" 322 | val intNumber = "-?\\d{1,9}" // more then 9 it cannot be int 323 | val longNumber = "-?\\d{10,18}" // more then 19 it cannot be long 324 | if (data.matches(intNumber)) 325 | LongType // TODO: To return IntType the whole data set (or sample) needs to be analyzed. 326 | else if (data.matches(longNumber)) 327 | LongType 328 | else if (data.matches(double)) 329 | DecimalType() 330 | else 331 | parse(data).getOrElse(StringType) 332 | } 333 | 334 | protected def parse(data: String): Option[DataType] = DateTimeParser.isValidDate(data) match { 335 | case true => Some(TimestampType) 336 | case false => None 337 | } 338 | 339 | lazy val schemaRDD: SchemaRDD = initSchemaRDD(columnNames, originalRdd, structType) 340 | 341 | 342 | protected def initSchemaRDD(columnNames: Array[String], originalRdd: RDD[Array[String]], structType: StructType): SchemaRDD = { 343 | 344 | val sqlContext = uc.sqlContext 345 | val colNames = columnNames 346 | val rowRdd = originalRdd.map { colValues => 347 | if (colValues.size != colNames.size) throw new UnexpectedFileFormatException(s"Files should have the same number of columns. 
Line ${colValues.mkString(",")} \n has #${colValues.size} and the header has #${colNames.size}") 348 | val columns = colValues.zip(structType.fields).zipWithIndex.map { case ((value, tp), index) => 349 | //TODO do not convert the date here 350 | tp.dataType match { 351 | case DecimalType() | DoubleType => value.headOption.map(f => BigDecimal(value.trim)).getOrElse(throw new UnexpectedFileFormatException(s"Numeric columns can't be empty.\nIndex $index is empty at: ${colValues.mkString(",")}")) 352 | case LongType => value.headOption.map(f => value.trim.toLong).getOrElse(throw new UnexpectedFileFormatException(s"Long Numeric columns can't be empty.\nIndex $index is empty at: ${colValues.mkString(",")}")) 353 | case IntegerType => value.headOption.map(f => value.trim.toInt).getOrElse(throw new UnexpectedFileFormatException(s"Int Numeric columns can't be empty.\nIndex $index is empty at: ${colValues.mkString(",")}")) 354 | case TimestampType => new Timestamp(DateTimeParser.parse(value).map(_.toDate.getTime).getOrElse(0)) 355 | case _ => if (value.trim.isEmpty) "0" else value 356 | } 357 | } 358 | Row(columns: _*) 359 | } 360 | val schema = sqlContext.applySchema(rowRdd, structType) 361 | val tableName = extractTableName(file) 362 | schema.name = tableName 363 | schema.registerTempTable(tableName) 364 | schema.repartition(numberOfPartitions) 365 | schema 366 | } 367 | 368 | protected def structType(): StructType = { 369 | if (columnNames.size != typeLine.size || columnNames.size == 0) StructType(List.empty[StructField]) 370 | else { 371 | val fields = columnNames.zip(columnTypes).map { case (columnName, columnType) => new StructField(columnName, columnType, true)} 372 | StructType(fields) 373 | } 374 | } 375 | 376 | protected def extractFirstCompleteLine(dataRdd: RDD[Array[String]]): Array[String] = { 377 | val x = dataRdd.filter { f => 378 | !f.isEmpty && 379 | f.forall(!_.isEmpty) 380 | }.first 381 | x 382 | } 383 | 384 | protected def extractTableName(file: String): String = { 385 | val name = file.split("/").last 386 | val index = name.indexOf(".csv") + name.indexOf(".txt") 387 | name.splitAt(index + 1).productIterator.toList.filter(!_.toString.isEmpty).head.toString 388 | } 389 | 390 | def header(newHeader: String) = { 391 | new FileDataset(uc, file, separator, Some(newHeader)) 392 | } 393 | 394 | def toSchemaRDD = schemaRDD 395 | 396 | def toDataset(): Dataset = { 397 | new Dataset(schemaRDD) 398 | } 399 | } 400 | -------------------------------------------------------------------------------- /ansible/inventory/hosts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | EC2 external inventory script 5 | ================================= 6 | 7 | Generates inventory that Ansible can understand by making API requests to 8 | AWS EC2 using the Boto library. 9 | 10 | NOTE: This script assumes Ansible is being executed where the environment 11 | variables needed for Boto have already been set: 12 | export AWS_ACCESS_KEY_ID='AK123' 13 | export AWS_SECRET_ACCESS_KEY='abc123' 14 | 15 | This script also assumes there is an ec2.ini file alongside it. 
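A typical invocation, assuming this file has been made executable and the
Boto credentials above are exported (the address below is purely
illustrative), looks like:

    ./hosts --list
    ./hosts --host 203.0.113.10
    ./hosts --refresh-cache --list
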
To specify a 16 | different path to ec2.ini, define the EC2_INI_PATH environment variable: 17 | 18 | export EC2_INI_PATH=/path/to/my_ec2.ini 19 | 20 | If you're using eucalyptus you need to set the above variables and 21 | you need to define: 22 | 23 | export EC2_URL=http://hostname_of_your_cc:port/services/Eucalyptus 24 | 25 | For more details, see: http://docs.pythonboto.org/en/latest/boto_config_tut.html 26 | 27 | When run against a specific host, this script returns the following variables: 28 | - ec2_ami_launch_index 29 | - ec2_architecture 30 | - ec2_association 31 | - ec2_attachTime 32 | - ec2_attachment 33 | - ec2_attachmentId 34 | - ec2_client_token 35 | - ec2_deleteOnTermination 36 | - ec2_description 37 | - ec2_deviceIndex 38 | - ec2_dns_name 39 | - ec2_eventsSet 40 | - ec2_group_name 41 | - ec2_hypervisor 42 | - ec2_id 43 | - ec2_image_id 44 | - ec2_instanceState 45 | - ec2_instance_type 46 | - ec2_ipOwnerId 47 | - ec2_ip_address 48 | - ec2_item 49 | - ec2_kernel 50 | - ec2_key_name 51 | - ec2_launch_time 52 | - ec2_monitored 53 | - ec2_monitoring 54 | - ec2_networkInterfaceId 55 | - ec2_ownerId 56 | - ec2_persistent 57 | - ec2_placement 58 | - ec2_platform 59 | - ec2_previous_state 60 | - ec2_private_dns_name 61 | - ec2_private_ip_address 62 | - ec2_publicIp 63 | - ec2_public_dns_name 64 | - ec2_ramdisk 65 | - ec2_reason 66 | - ec2_region 67 | - ec2_requester_id 68 | - ec2_root_device_name 69 | - ec2_root_device_type 70 | - ec2_security_group_ids 71 | - ec2_security_group_names 72 | - ec2_shutdown_state 73 | - ec2_sourceDestCheck 74 | - ec2_spot_instance_request_id 75 | - ec2_state 76 | - ec2_state_code 77 | - ec2_state_reason 78 | - ec2_status 79 | - ec2_subnet_id 80 | - ec2_tenancy 81 | - ec2_virtualization_type 82 | - ec2_vpc_id 83 | 84 | These variables are pulled out of a boto.ec2.instance object. There is a lack of 85 | consistency with variable spellings (camelCase and underscores) since this 86 | just loops through all variables the object exposes. It is preferred to use the 87 | ones with underscores when multiple exist. 88 | 89 | In addition, if an instance has AWS Tags associated with it, each tag is a new 90 | variable named: 91 | - ec2_tag_[Key] = [Value] 92 | 93 | Security groups are comma-separated in 'ec2_security_group_ids' and 94 | 'ec2_security_group_names'. 95 | ''' 96 | 97 | # (c) 2012, Peter Sankauskas 98 | # 99 | # This file is part of Ansible, 100 | # 101 | # Ansible is free software: you can redistribute it and/or modify 102 | # it under the terms of the GNU General Public License as published by 103 | # the Free Software Foundation, either version 3 of the License, or 104 | # (at your option) any later version. 105 | # 106 | # Ansible is distributed in the hope that it will be useful, 107 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 108 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 109 | # GNU General Public License for more details. 110 | # 111 | # You should have received a copy of the GNU General Public License 112 | # along with Ansible. If not, see . 
113 | 114 | ###################################################################### 115 | 116 | import sys 117 | import os 118 | import argparse 119 | import re 120 | from time import time 121 | import boto 122 | from boto import ec2 123 | from boto import rds 124 | from boto import route53 125 | import ConfigParser 126 | 127 | try: 128 | import json 129 | except ImportError: 130 | import simplejson as json 131 | 132 | 133 | class Ec2Inventory(object): 134 | def _empty_inventory(self): 135 | return {"_meta" : {"hostvars" : {}}} 136 | 137 | def __init__(self): 138 | ''' Main execution path ''' 139 | 140 | # Inventory grouped by instance IDs, tags, security groups, regions, 141 | # and availability zones 142 | self.inventory = self._empty_inventory() 143 | 144 | # Index of hostname (address) to instance ID 145 | self.index = {} 146 | 147 | # Read settings and parse CLI arguments 148 | self.read_settings() 149 | self.parse_cli_args() 150 | 151 | # Cache 152 | if self.args.refresh_cache: 153 | self.do_api_calls_update_cache() 154 | elif not self.is_cache_valid(): 155 | self.do_api_calls_update_cache() 156 | 157 | # Data to print 158 | if self.args.host: 159 | data_to_print = self.get_host_info() 160 | 161 | elif self.args.list: 162 | # Display list of instances for inventory 163 | if self.inventory == self._empty_inventory(): 164 | data_to_print = self.get_inventory_from_cache() 165 | else: 166 | data_to_print = self.json_format_dict(self.inventory, True) 167 | 168 | print data_to_print 169 | 170 | 171 | def is_cache_valid(self): 172 | ''' Determines if the cache files have expired, or if it is still valid ''' 173 | 174 | if os.path.isfile(self.cache_path_cache): 175 | mod_time = os.path.getmtime(self.cache_path_cache) 176 | current_time = time() 177 | if (mod_time + self.cache_max_age) > current_time: 178 | if os.path.isfile(self.cache_path_index): 179 | return True 180 | 181 | return False 182 | 183 | 184 | def read_settings(self): 185 | ''' Reads the settings from the ec2.ini file ''' 186 | 187 | config = ConfigParser.SafeConfigParser() 188 | ec2_default_ini_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'ec2.ini') 189 | ec2_ini_path = os.environ.get('EC2_INI_PATH', ec2_default_ini_path) 190 | config.read(ec2_ini_path) 191 | 192 | # is eucalyptus? 
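# For reference, an illustrative ec2.ini for this method (example values only;
# regions, regions_exclude, destination_variable, vpc_destination_variable,
# route53, cache_path and cache_max_age are read unconditionally below, the
# remaining keys are optional):
#
#   [ec2]
#   regions = all
#   regions_exclude = us-gov-west-1
#   destination_variable = public_dns_name
#   vpc_destination_variable = ip_address
#   route53 = False
#   all_instances = False
#   cache_path = ~/.ansible/tmp
#   cache_max_age = 300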
193 | self.eucalyptus_host = None 194 | self.eucalyptus = False 195 | if config.has_option('ec2', 'eucalyptus'): 196 | self.eucalyptus = config.getboolean('ec2', 'eucalyptus') 197 | if self.eucalyptus and config.has_option('ec2', 'eucalyptus_host'): 198 | self.eucalyptus_host = config.get('ec2', 'eucalyptus_host') 199 | 200 | # Regions 201 | self.regions = [] 202 | configRegions = config.get('ec2', 'regions') 203 | configRegions_exclude = config.get('ec2', 'regions_exclude') 204 | if (configRegions == 'all'): 205 | if self.eucalyptus_host: 206 | self.regions.append(boto.connect_euca(host=self.eucalyptus_host).region.name) 207 | else: 208 | for regionInfo in ec2.regions(): 209 | if regionInfo.name not in configRegions_exclude: 210 | self.regions.append(regionInfo.name) 211 | else: 212 | self.regions = configRegions.split(",") 213 | 214 | # Destination addresses 215 | self.destination_variable = config.get('ec2', 'destination_variable') 216 | self.vpc_destination_variable = config.get('ec2', 'vpc_destination_variable') 217 | 218 | # Route53 219 | self.route53_enabled = config.getboolean('ec2', 'route53') 220 | self.route53_excluded_zones = [] 221 | if config.has_option('ec2', 'route53_excluded_zones'): 222 | self.route53_excluded_zones.extend( 223 | config.get('ec2', 'route53_excluded_zones', '').split(',')) 224 | 225 | # Return all EC2/RDS instances 226 | if config.has_option('ec2', 'all_instances'): 227 | self.all_instances = config.getboolean('ec2', 'all_instances') 228 | else: 229 | self.all_instances = False 230 | if config.has_option('ec2', 'all_rds_instances'): 231 | self.all_rds_instances = config.getboolean('ec2', 'all_rds_instances') 232 | else: 233 | self.all_rds_instances = False 234 | 235 | # Cache related 236 | cache_dir = os.path.expanduser(config.get('ec2', 'cache_path')) 237 | if not os.path.exists(cache_dir): 238 | os.makedirs(cache_dir) 239 | 240 | self.cache_path_cache = cache_dir + "/ansible-ec2.cache" 241 | self.cache_path_index = cache_dir + "/ansible-ec2.index" 242 | self.cache_max_age = config.getint('ec2', 'cache_max_age') 243 | 244 | 245 | 246 | def parse_cli_args(self): 247 | ''' Command line argument processing ''' 248 | 249 | parser = argparse.ArgumentParser(description='Produce an Ansible Inventory file based on EC2') 250 | parser.add_argument('--list', action='store_true', default=True, 251 | help='List instances (default: True)') 252 | parser.add_argument('--host', action='store', 253 | help='Get all the variables about a specific instance') 254 | parser.add_argument('--refresh-cache', action='store_true', default=False, 255 | help='Force refresh of cache by making API requests to EC2 (default: False - use cache files)') 256 | self.args = parser.parse_args() 257 | 258 | 259 | def do_api_calls_update_cache(self): 260 | ''' Do API calls to each region, and save data in cache files ''' 261 | 262 | if self.route53_enabled: 263 | self.get_route53_records() 264 | 265 | for region in self.regions: 266 | self.get_instances_by_region(region) 267 | self.get_rds_instances_by_region(region) 268 | 269 | self.write_to_cache(self.inventory, self.cache_path_cache) 270 | self.write_to_cache(self.index, self.cache_path_index) 271 | 272 | 273 | def get_instances_by_region(self, region): 274 | ''' Makes an AWS EC2 API call to the list of instances in a particular 275 | region ''' 276 | 277 | try: 278 | if self.eucalyptus: 279 | conn = boto.connect_euca(host=self.eucalyptus_host) 280 | conn.APIVersion = '2010-08-31' 281 | else: 282 | conn = ec2.connect_to_region(region) 283 | 284 
| # connect_to_region will fail "silently" by returning None if the region name is wrong or not supported 285 | if conn is None: 286 | print("region name: %s likely not supported, or AWS is down. connection to region failed." % region) 287 | sys.exit(1) 288 | 289 | reservations = conn.get_all_instances() 290 | for reservation in reservations: 291 | for instance in reservation.instances: 292 | self.add_instance(instance, region) 293 | 294 | except boto.exception.BotoServerError, e: 295 | if not self.eucalyptus: 296 | print "Looks like AWS is down again:" 297 | print e 298 | sys.exit(1) 299 | 300 | def get_rds_instances_by_region(self, region): 301 | ''' Makes an AWS API call to the list of RDS instances in a particular 302 | region ''' 303 | 304 | try: 305 | conn = rds.connect_to_region(region) 306 | if conn: 307 | instances = conn.get_all_dbinstances() 308 | for instance in instances: 309 | self.add_rds_instance(instance, region) 310 | except boto.exception.BotoServerError, e: 311 | if not e.reason == "Forbidden": 312 | print "Looks like AWS RDS is down: " 313 | print e 314 | sys.exit(1) 315 | 316 | def get_instance(self, region, instance_id): 317 | ''' Gets details about a specific instance ''' 318 | if self.eucalyptus: 319 | conn = boto.connect_euca(self.eucalyptus_host) 320 | conn.APIVersion = '2010-08-31' 321 | else: 322 | conn = ec2.connect_to_region(region) 323 | 324 | # connect_to_region will fail "silently" by returning None if the region name is wrong or not supported 325 | if conn is None: 326 | print("region name: %s likely not supported, or AWS is down. connection to region failed." % region) 327 | sys.exit(1) 328 | 329 | reservations = conn.get_all_instances([instance_id]) 330 | for reservation in reservations: 331 | for instance in reservation.instances: 332 | return instance 333 | 334 | 335 | def add_instance(self, instance, region): 336 | ''' Adds an instance to the inventory and index, as long as it is 337 | addressable ''' 338 | 339 | # Only want running instances unless all_instances is True 340 | if not self.all_instances and instance.state != 'running': 341 | return 342 | 343 | # Select the best destination address 344 | if instance.subnet_id: 345 | dest = getattr(instance, self.vpc_destination_variable) 346 | else: 347 | dest = getattr(instance, self.destination_variable) 348 | 349 | if not dest: 350 | # Skip instances we cannot address (e.g. private VPC subnet) 351 | return 352 | 353 | # Add to index 354 | self.index[dest] = [region, instance.id] 355 | 356 | # Inventory: Group by instance ID (always a group of 1) 357 | self.inventory[instance.id] = [dest] 358 | 359 | # Inventory: Group by region 360 | self.push(self.inventory, region, dest) 361 | 362 | # Inventory: Group by availability zone 363 | self.push(self.inventory, instance.placement, dest) 364 | 365 | # Inventory: Group by instance type 366 | self.push(self.inventory, self.to_safe('type_' + instance.instance_type), dest) 367 | 368 | # Inventory: Group by key pair 369 | if instance.key_name: 370 | self.push(self.inventory, self.to_safe('key_' + instance.key_name), dest) 371 | 372 | # Inventory: Group by security group 373 | try: 374 | for group in instance.groups: 375 | key = self.to_safe("security_group_" + group.name) 376 | self.push(self.inventory, key, dest) 377 | except AttributeError: 378 | print 'Package boto seems a bit older.' 379 | print 'Please upgrade boto >= 2.3.0.' 
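# instance.groups, iterated above, is only exposed by boto 2.3.0 and later,
# hence the upgrade hint followed by a hard exit.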
380 | sys.exit(1) 381 | 382 | # Inventory: Group by tag keys 383 | for k, v in instance.tags.iteritems(): 384 | key = self.to_safe("tag_" + k + "=" + v) 385 | self.push(self.inventory, key, dest) 386 | 387 | # Inventory: Group by Route53 domain names if enabled 388 | if self.route53_enabled: 389 | route53_names = self.get_instance_route53_names(instance) 390 | for name in route53_names: 391 | self.push(self.inventory, name, dest) 392 | 393 | # Global Tag: tag all EC2 instances 394 | self.push(self.inventory, 'ec2', dest) 395 | 396 | self.inventory["_meta"]["hostvars"][dest] = self.get_host_info_dict_from_instance(instance) 397 | 398 | 399 | def add_rds_instance(self, instance, region): 400 | ''' Adds an RDS instance to the inventory and index, as long as it is 401 | addressable ''' 402 | 403 | # Only want available instances unless all_rds_instances is True 404 | if not self.all_rds_instances and instance.status != 'available': 405 | return 406 | 407 | # Select the best destination address 408 | #if instance.subnet_id: 409 | #dest = getattr(instance, self.vpc_destination_variable) 410 | #else: 411 | #dest = getattr(instance, self.destination_variable) 412 | dest = instance.endpoint[0] 413 | 414 | if not dest: 415 | # Skip instances we cannot address (e.g. private VPC subnet) 416 | return 417 | 418 | # Add to index 419 | self.index[dest] = [region, instance.id] 420 | 421 | # Inventory: Group by instance ID (always a group of 1) 422 | self.inventory[instance.id] = [dest] 423 | 424 | # Inventory: Group by region 425 | self.push(self.inventory, region, dest) 426 | 427 | # Inventory: Group by availability zone 428 | self.push(self.inventory, instance.availability_zone, dest) 429 | 430 | # Inventory: Group by instance type 431 | self.push(self.inventory, self.to_safe('type_' + instance.instance_class), dest) 432 | 433 | # Inventory: Group by security group 434 | try: 435 | if instance.security_group: 436 | key = self.to_safe("security_group_" + instance.security_group.name) 437 | self.push(self.inventory, key, dest) 438 | except AttributeError: 439 | print 'Package boto seems a bit older.' 440 | print 'Please upgrade boto >= 2.3.0.' 441 | sys.exit(1) 442 | 443 | # Inventory: Group by engine 444 | self.push(self.inventory, self.to_safe("rds_" + instance.engine), dest) 445 | 446 | # Inventory: Group by parameter group 447 | self.push(self.inventory, self.to_safe("rds_parameter_group_" + instance.parameter_group.name), dest) 448 | 449 | # Global Tag: all RDS instances 450 | self.push(self.inventory, 'rds', dest) 451 | 452 | 453 | def get_route53_records(self): 454 | ''' Get and store the map of resource records to domain names that 455 | point to them. ''' 456 | 457 | r53_conn = route53.Route53Connection() 458 | all_zones = r53_conn.get_zones() 459 | 460 | route53_zones = [ zone for zone in all_zones if zone.name[:-1] 461 | not in self.route53_excluded_zones ] 462 | 463 | self.route53_records = {} 464 | 465 | for zone in route53_zones: 466 | rrsets = r53_conn.get_all_rrsets(zone.id) 467 | 468 | for record_set in rrsets: 469 | record_name = record_set.name 470 | 471 | if record_name.endswith('.'): 472 | record_name = record_name[:-1] 473 | 474 | for resource in record_set.resource_records: 475 | self.route53_records.setdefault(resource, set()) 476 | self.route53_records[resource].add(record_name) 477 | 478 | 479 | def get_instance_route53_names(self, instance): 480 | ''' Check if an instance is referenced in the records we have from 481 | Route53. 
If it is, return the list of domain names pointing to said 482 | instance. If nothing points to it, return an empty list. ''' 483 | 484 | instance_attributes = [ 'public_dns_name', 'private_dns_name', 485 | 'ip_address', 'private_ip_address' ] 486 | 487 | name_list = set() 488 | 489 | for attrib in instance_attributes: 490 | try: 491 | value = getattr(instance, attrib) 492 | except AttributeError: 493 | continue 494 | 495 | if value in self.route53_records: 496 | name_list.update(self.route53_records[value]) 497 | 498 | return list(name_list) 499 | 500 | 501 | def get_host_info_dict_from_instance(self, instance): 502 | instance_vars = {} 503 | for key in vars(instance): 504 | value = getattr(instance, key) 505 | key = self.to_safe('ec2_' + key) 506 | 507 | # Handle complex types 508 | # state/previous_state changed to properties in boto in https://github.com/boto/boto/commit/a23c379837f698212252720d2af8dec0325c9518 509 | if key == 'ec2__state': 510 | instance_vars['ec2_state'] = instance.state or '' 511 | instance_vars['ec2_state_code'] = instance.state_code 512 | elif key == 'ec2__previous_state': 513 | instance_vars['ec2_previous_state'] = instance.previous_state or '' 514 | instance_vars['ec2_previous_state_code'] = instance.previous_state_code 515 | elif type(value) in [int, bool]: 516 | instance_vars[key] = value 517 | elif type(value) in [str, unicode]: 518 | instance_vars[key] = value.strip() 519 | elif type(value) == type(None): 520 | instance_vars[key] = '' 521 | elif key == 'ec2_region': 522 | instance_vars[key] = value.name 523 | elif key == 'ec2__placement': 524 | instance_vars['ec2_placement'] = value.zone 525 | elif key == 'ec2_tags': 526 | for k, v in value.iteritems(): 527 | key = self.to_safe('ec2_tag_' + k) 528 | instance_vars[key] = v 529 | elif key == 'ec2_groups': 530 | group_ids = [] 531 | group_names = [] 532 | for group in value: 533 | group_ids.append(group.id) 534 | group_names.append(group.name) 535 | instance_vars["ec2_security_group_ids"] = ','.join(group_ids) 536 | instance_vars["ec2_security_group_names"] = ','.join(group_names) 537 | else: 538 | pass 539 | # TODO Product codes if someone finds them useful 540 | #print key 541 | #print type(value) 542 | #print value 543 | 544 | return instance_vars 545 | 546 | def get_host_info(self): 547 | ''' Get variables about a specific host ''' 548 | 549 | if len(self.index) == 0: 550 | # Need to load index from cache 551 | self.load_index_from_cache() 552 | 553 | if not self.args.host in self.index: 554 | # try updating the cache 555 | self.do_api_calls_update_cache() 556 | if not self.args.host in self.index: 557 | # host might not exist anymore 558 | return self.json_format_dict({}, True) 559 | 560 | (region, instance_id) = self.index[self.args.host] 561 | 562 | instance = self.get_instance(region, instance_id) 563 | return self.json_format_dict(self.get_host_info_dict_from_instance(instance), True) 564 | 565 | def push(self, my_dict, key, element): 566 | ''' Pushes an element onto an array that may not have been defined in 567 | the dict ''' 568 | 569 | if key in my_dict: 570 | my_dict[key].append(element) 571 | else: 572 | my_dict[key] = [element] 573 | 574 | 575 | def get_inventory_from_cache(self): 576 | ''' Reads the inventory from the cache file and returns it as a JSON 577 | object ''' 578 | 579 | cache = open(self.cache_path_cache, 'r') 580 | json_inventory = cache.read() 581 | return json_inventory 582 | 583 | 584 | def load_index_from_cache(self): 585 | ''' Reads the index from the cache file and sets 
self.index ''' 586 | 587 | cache = open(self.cache_path_index, 'r') 588 | json_index = cache.read() 589 | self.index = json.loads(json_index) 590 | 591 | 592 | def write_to_cache(self, data, filename): 593 | ''' Writes data in JSON format to a file ''' 594 | 595 | json_data = self.json_format_dict(data, True) 596 | cache = open(filename, 'w') 597 | cache.write(json_data) 598 | cache.close() 599 | 600 | 601 | def to_safe(self, word): 602 | ''' Converts 'bad' characters in a string to underscores so they can be 603 | used as Ansible groups ''' 604 | 605 | return re.sub("[^A-Za-z0-9\-]", "_", word) 606 | 607 | 608 | def json_format_dict(self, data, pretty=False): 609 | ''' Converts a dict to a JSON object and dumps it as a formatted 610 | string ''' 611 | 612 | if pretty: 613 | return json.dumps(data, sort_keys=True, indent=2) 614 | else: 615 | return json.dumps(data) 616 | 617 | 618 | # Run the script 619 | Ec2Inventory() 620 | 621 | -------------------------------------------------------------------------------- /src/universal/ec2/spark_ec2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import with_statement 5 | 6 | import logging 7 | import os 8 | import pipes 9 | import random 10 | import shutil 11 | import subprocess 12 | import sys 13 | import tempfile 14 | import time 15 | import urllib2 16 | import string 17 | from optparse import OptionParser 18 | from sys import stderr 19 | from string import Template 20 | import boto 21 | from boto.ec2.blockdevicemapping import BlockDeviceMapping, EBSBlockDeviceType 22 | from boto import ec2 23 | import datetime 24 | from datetime import datetime 25 | from datetime import timedelta 26 | 27 | class UsageError(Exception): 28 | pass 29 | 30 | DEFAULT_SPARK_VERSION = "1.2.0" 31 | SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) 32 | MESOS_SPARK_EC2_BRANCH = "v4" 33 | # A URL prefix from which to fetch AMI information 34 | AMI_PREFIX = "https://raw.github.com/mesos/spark-ec2/{b}/ami-list".format(b=MESOS_SPARK_EC2_BRANCH) 35 | 36 | 37 | # Configure and parse our command-line arguments 38 | def parse_args(): 39 | parser = OptionParser(usage="spark-ec2 [options] " 40 | + "\n\n can be: launch, destroy, login, stop, start, get-master", 41 | add_help_option=False) 42 | parser.add_option("-h", "--help", action="help", 43 | help="Show this help message and exit") 44 | parser.add_option("-s", "--slaves", type="int", default=1, 45 | help="Number of slaves to launch (default: 1)") 46 | parser.add_option("-w", "--wait", type="int", default=120, 47 | help="Seconds to wait for nodes to start (default: 120)") 48 | parser.add_option("-k", "--key-pair", 49 | help="Key pair to use on instances") 50 | parser.add_option("-i", "--identity-file", 51 | help="SSH private key file to use for logging into instances") 52 | parser.add_option("-t", "--instance-type", default="m1.large", 53 | help="Type of instance to launch (default: m1.large). 
" + 54 | "WARNING: must be 64-bit; small instances won't work") 55 | parser.add_option("-m", "--master-instance-type", default="", 56 | help="Master instance type (leave empty for same as instance-type)") 57 | parser.add_option("-r", "--region", help="EC2 region zone to launch instances in") 58 | parser.add_option("-z", "--zone", help="Availability zone to launch instances in, or 'all' to spread " + 59 | "slaves across multiple (an additional $0.01/Gb for bandwidth" + 60 | "between zones applies)") 61 | parser.add_option("-a", "--ami", help="Amazon Machine Image ID to use") 62 | parser.add_option("-p", "--profile", help="AWS profile/role arn to use") 63 | parser.add_option("-v", "--spark-version", default=DEFAULT_SPARK_VERSION, 64 | help="Version of Spark to use: 'X.Y.Z' or a specific git hash") 65 | parser.add_option("--spark-git-repo", 66 | default="https://github.com/apache/spark", 67 | help="Github repo from which to checkout supplied commit hash") 68 | parser.add_option("--hadoop-major-version", default="2", 69 | help="Major version of Hadoop (default: 2)") 70 | parser.add_option("-D", metavar="[ADDRESS:]PORT", dest="proxy_port", 71 | help="Use SSH dynamic port forwarding to create a SOCKS proxy at " + 72 | "the given local address (for use with login)") 73 | parser.add_option("--resume", action="store_true", default=False, 74 | help="Resume installation on a previously launched cluster " + 75 | "(for debugging)") 76 | parser.add_option("--ebs-vol-size", metavar="SIZE", type="int", default=0, 77 | help="Attach a new EBS volume of size SIZE (in GB) to each node as " + 78 | "/vol. The volumes will be deleted when the instances terminate. " + 79 | "Only possible on EBS-backed AMIs.") 80 | parser.add_option("--swap", metavar="SWAP", type="int", default=1024, 81 | help="Swap space to set up per node, in MB (default: 1024)") 82 | parser.add_option("--spot-price", metavar="PRICE", type="float", 83 | help="If specified, launch slaves as spot instances with the given " + 84 | "maximum price (in dollars)") 85 | parser.add_option("--ganglia", action="store_true", default=True, 86 | help="Setup Ganglia monitoring on cluster (default: on). 
NOTE: " + 87 | "the Ganglia page will be publicly accessible") 88 | parser.add_option("--no-ganglia", action="store_false", dest="ganglia", 89 | help="Disable Ganglia monitoring for the cluster") 90 | parser.add_option("-u", "--user", default="root", 91 | help="The SSH user you want to connect as (default: root)") 92 | parser.add_option("--delete-groups", action="store_true", default=False, 93 | help="When destroying a cluster, delete the security groups that were created") 94 | parser.add_option("--use-existing-master", action="store_true", default=False, 95 | help="Launch fresh slaves, but use an existing stopped master if possible") 96 | parser.add_option("--worker-instances", type="int", default=1, 97 | help="Number of instances per worker: variable SPARK_WORKER_INSTANCES (default: 1)") 98 | parser.add_option("--master-opts", type="string", default="", 99 | help="Extra options to give to master through SPARK_MASTER_OPTS variable (e.g -Dspark.worker.timeout=180)") 100 | 101 | (opts, args) = parser.parse_args() 102 | if len(args) != 2: 103 | parser.print_help() 104 | sys.exit(1) 105 | (action, cluster_name) = args 106 | 107 | if opts.region is None: 108 | opts.region = region() 109 | 110 | if opts.zone is None: 111 | opts.zone = zone() 112 | 113 | # Boto config check 114 | # http://boto.cloudhackers.com/en/latest/boto_config_tut.html 115 | # home_dir = os.getenv('HOME') 116 | # if home_dir == None or not os.path.isfile(home_dir + '/.boto'): 117 | # if not os.path.isfile('/etc/boto.cfg'): 118 | # if os.getenv('AWS_ACCESS_KEY_ID') == None: 119 | # print >> stderr, ("ERROR: The environment variable AWS_ACCESS_KEY_ID " + 120 | # "must be set") 121 | # sys.exit(1) 122 | # if os.getenv('AWS_SECRET_ACCESS_KEY') == None: 123 | # print >> stderr, ("ERROR: The environment variable AWS_SECRET_ACCESS_KEY " + 124 | # "must be set") 125 | # sys.exit(1) 126 | return (opts, action, cluster_name) 127 | 128 | 129 | # Get the EC2 security group of the given name, creating it if it doesn't exist 130 | def get_or_make_group(conn, name): 131 | groups = conn.get_all_security_groups() 132 | group = [g for g in groups if g.name == name] 133 | if len(group) > 0: 134 | return group[0] 135 | else: 136 | print "Creating security group " + name 137 | return conn.create_security_group(name, "Spark EC2 group") 138 | 139 | 140 | # Wait for a set of launched instances to exit the "pending" state 141 | # (i.e. either to start running or to fail and be terminated) 142 | def wait_for_instances(conn, instances): 143 | ids = [i.id for i in instances] 144 | while True: 145 | # for i in instances: 146 | # i.update() 147 | # if len([i for i in instances if i.state == 'pending']) > 0: 148 | # 149 | instace_stati = conn.get_all_instance_status(instance_ids=ids) 150 | if len([i for i in instace_stati if i.system_status.details['reachability'] != 'passed' or i.instance_status.details['reachability'] != 'passed']) > 0: 151 | time.sleep(5) 152 | else: 153 | return 154 | 155 | 156 | # Check whether a given EC2 instance object is in a state we consider active, 157 | # i.e. not terminating or terminated. We count both stopping and stopped as 158 | # active since we can restart stopped clusters. 
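# (A stopped master, for instance, can be started again by launch_cluster when
# --use-existing-master is passed, so it must still count as active here.)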
159 | def is_active(instance): 160 | return (instance.state in ['pending', 'running', 'stopping', 'stopped']) 161 | 162 | # Return correct versions of Spark and Shark, given the supplied Spark version 163 | def get_spark_shark_version(opts): 164 | spark_shark_map = { 165 | "0.7.3": "0.7.1", 166 | "0.8.0": "0.8.0", 167 | "0.8.1": "0.8.1", 168 | "0.9.0": "0.9.0", 169 | "0.9.1": "0.9.1", 170 | "1.0.0": "1.0.0", 171 | "1.0.1": "1.0.1", 172 | "1.0.2": "1.0.2", 173 | "1.1.0": "1.1.0", 174 | "1.2.0": "1.2.0" 175 | } 176 | version = opts.spark_version.replace("v", "") 177 | if version not in spark_shark_map: 178 | print >> stderr, "Don't know about Spark version: %s" % version 179 | sys.exit(1) 180 | return (version, spark_shark_map[version]) 181 | 182 | # Attempt to resolve an appropriate AMI given the architecture and 183 | # region of the request. 184 | # Information regarding Amazon Linux AMI instance type was updated on 2014-6-20: 185 | # http://aws.amazon.com/amazon-linux-ami/instance-type-matrix/ 186 | def get_spark_ami(opts): 187 | instance_types = { 188 | "c1.medium": "pvm", 189 | "c1.xlarge": "pvm", 190 | "c3.2xlarge": "pvm", 191 | "c3.4xlarge": "pvm", 192 | "c3.8xlarge": "pvm", 193 | "c3.large": "pvm", 194 | "c3.xlarge": "pvm", 195 | "cc1.4xlarge": "hvm", 196 | "cc2.8xlarge": "hvm", 197 | "cg1.4xlarge": "hvm", 198 | "cr1.8xlarge": "hvm", 199 | "hi1.4xlarge": "pvm", 200 | "hs1.8xlarge": "pvm", 201 | "i2.2xlarge": "hvm", 202 | "i2.4xlarge": "hvm", 203 | "i2.8xlarge": "hvm", 204 | "i2.xlarge": "hvm", 205 | "m1.large": "pvm", 206 | "m1.medium": "pvm", 207 | "m1.small": "pvm", 208 | "m1.xlarge": "pvm", 209 | "m2.2xlarge": "pvm", 210 | "m2.4xlarge": "pvm", 211 | "m2.xlarge": "pvm", 212 | "m3.2xlarge": "hvm", 213 | "m3.large": "hvm", 214 | "m3.medium": "hvm", 215 | "m3.xlarge": "hvm", 216 | "r3.2xlarge": "hvm", 217 | "r3.4xlarge": "hvm", 218 | "r3.8xlarge": "hvm", 219 | "r3.large": "hvm", 220 | "r3.xlarge": "hvm", 221 | "t1.micro": "pvm", 222 | "t2.medium": "hvm", 223 | "t2.micro": "hvm", 224 | "t2.small": "hvm", 225 | } 226 | 227 | if opts.instance_type in instance_types: 228 | instance_type = instance_types[opts.instance_type] 229 | else: 230 | instance_type = "pvm" 231 | print >> stderr,\ 232 | "Don't recognize %s, assuming type is pvm" % opts.instance_type 233 | 234 | ami_path = "%s/%s/%s" % (AMI_PREFIX, opts.region, instance_type) 235 | try: 236 | ami = urllib2.urlopen(ami_path).read().strip() 237 | print "Spark AMI: " + ami 238 | except: 239 | print >> stderr, "Could not resolve AMI at: " + ami_path 240 | sys.exit(1) 241 | 242 | return ami 243 | 244 | # Launch a cluster of the given name, by setting up its security groups, 245 | # and then starting new instances in them. 246 | # Returns a tuple of EC2 reservation objects for the master and slaves 247 | # Fails if there are already instances running in the cluster's groups. 248 | def launch_cluster(conn, opts, cluster_name): 249 | 250 | #Remove known hosts to avoid "Offending key for IP ..." errors. 251 | known_hosts = os.environ['HOME'] + "/.ssh/known_hosts" 252 | if os.path.isfile(known_hosts): 253 | os.remove(known_hosts) 254 | if opts.key_pair is None: 255 | opts.key_pair = keypair() 256 | if opts.key_pair is None: 257 | print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." 258 | sys.exit(1) 259 | 260 | if opts.profile is None: 261 | opts.profile = profile() 262 | if opts.profile is None: 263 | print >> stderr, "ERROR: No profile found on the current host. It must be provided with the -p option." 
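# opts.profile is the IAM instance profile/role ARN taken from -p/--profile;
# it is forwarded as instance_profile_arn to every launch and spot request
# below, so the cluster cannot be brought up without it.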
264 | sys.exit(1) 265 | 266 | public_key = pub_key() 267 | user_data = Template("""#!/bin/bash 268 | set -e -x 269 | echo '$public_key' >> ~root/.ssh/authorized_keys 270 | echo '$public_key' >> ~ec2-user/.ssh/authorized_keys""").substitute(public_key=public_key) 271 | 272 | print "Setting up security groups..." 273 | master_group = get_or_make_group(conn, cluster_name + "-master") 274 | slave_group = get_or_make_group(conn, cluster_name + "-slaves") 275 | sparknotebook_group = get_or_make_group(conn, "SparkNotebookApplication") 276 | if master_group.rules == []: # Group was just now created 277 | master_group.authorize(src_group=master_group) 278 | master_group.authorize(src_group=slave_group) 279 | master_group.authorize(src_group=sparknotebook_group) 280 | master_group.authorize('tcp', 22, 22, '0.0.0.0/0') 281 | master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') 282 | master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0') 283 | master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0') 284 | master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') 285 | master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') 286 | master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') 287 | master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') 288 | master_group.authorize('tcp', 7077, 7077, '0.0.0.0/0') 289 | if opts.ganglia: 290 | master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') 291 | if slave_group.rules == []: # Group was just now created 292 | slave_group.authorize(src_group=master_group) 293 | slave_group.authorize(src_group=slave_group) 294 | slave_group.authorize(src_group=sparknotebook_group) 295 | slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') 296 | slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') 297 | slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') 298 | slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') 299 | slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') 300 | slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') 301 | 302 | if not any(r for r in sparknotebook_group.rules for g in r.grants if master_group.id == g.group_id): 303 | sparknotebook_group.authorize(ip_protocol="tcp", from_port="1", to_port="65535", src_group=master_group) 304 | sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1", to_port="-1", src_group=master_group) 305 | 306 | if not any(r for r in sparknotebook_group.rules for g in r.grants if slave_group.id == g.group_id): 307 | sparknotebook_group.authorize(ip_protocol="tcp", from_port="1", to_port="65535", src_group=slave_group) 308 | sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1", to_port="-1", src_group=slave_group) 309 | 310 | # Check if instances are already running in our groups 311 | existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, 312 | die_on_error=False) 313 | if existing_slaves or (existing_masters and not opts.use_existing_master): 314 | print >> stderr,("ERROR: There are already instances running in " + 315 | "group %s or %s" % (master_group.name, slave_group.name)) 316 | sys.exit(1) 317 | 318 | # Figure out Spark AMI 319 | if opts.ami is None: 320 | opts.ami = get_spark_ami(opts) 321 | print "Launching instances..." 
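# opts.ami is now either the value passed with -a/--ami or the one resolved by
# get_spark_ami() from the mesos/spark-ec2 AMI list for this region and
# virtualization type; the lookup below exits if the image id cannot be found.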
322 | 323 | try: 324 | image = conn.get_all_images(image_ids=[opts.ami])[0] 325 | except: 326 | print >> stderr,"Could not find AMI " + opts.ami 327 | sys.exit(1) 328 | 329 | # Create block device mapping so that we can add an EBS volume if asked to 330 | block_map = BlockDeviceMapping() 331 | if opts.ebs_vol_size > 0: 332 | device = EBSBlockDeviceType() 333 | device.size = opts.ebs_vol_size 334 | device.delete_on_termination = True 335 | block_map["/dev/sdv"] = device 336 | 337 | # Launch slaves 338 | if opts.spot_price != None: 339 | zones = get_zones(conn, opts) 340 | 341 | num_zones = len(zones) 342 | i = 0 343 | my_req_ids = [] 344 | 345 | for zone in zones: 346 | best_price = find_best_price(conn,opts.instance_type,zone, opts.spot_price) 347 | # Launch spot instances with the requested price 348 | print >> stderr,("Requesting %d slaves as spot instances with price $%.3f/hour each (total $%.3f/hour)" % 349 | (opts.slaves, best_price, opts.slaves * best_price)) 350 | 351 | num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) 352 | interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(subnet_id=subnetId(), groups=[slave_group.id], associate_public_ip_address=True) 353 | interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface) 354 | 355 | slave_reqs = conn.request_spot_instances( 356 | price = best_price, 357 | image_id = opts.ami, 358 | launch_group = "launch-group-%s" % cluster_name, 359 | placement = zone, 360 | count = num_slaves_this_zone, 361 | key_name = opts.key_pair, 362 | instance_type = opts.instance_type, 363 | block_device_map = block_map, 364 | user_data = user_data, 365 | instance_profile_arn = opts.profile, 366 | network_interfaces = interfaces) 367 | my_req_ids += [req.id for req in slave_reqs] 368 | i += 1 369 | 370 | print >> stderr, "Waiting for spot instances to be granted" 371 | try: 372 | while True: 373 | time.sleep(10) 374 | reqs = conn.get_all_spot_instance_requests() 375 | id_to_req = {} 376 | for r in reqs: 377 | id_to_req[r.id] = r 378 | active_instance_ids = [] 379 | for i in my_req_ids: 380 | if i in id_to_req and id_to_req[i].state == "active": 381 | active_instance_ids.append(id_to_req[i].instance_id) 382 | if len(active_instance_ids) == opts.slaves: 383 | print >> stderr, "All %d slaves granted" % opts.slaves 384 | reservations = conn.get_all_instances(active_instance_ids) 385 | slave_nodes = [] 386 | for r in reservations: 387 | slave_nodes += r.instances 388 | break 389 | else: 390 | # print >> stderr, ".", 391 | print "%d of %d slaves granted, waiting longer" % ( 392 | len(active_instance_ids), opts.slaves) 393 | except: 394 | print >> stderr, "Canceling spot instance requests" 395 | conn.cancel_spot_instance_requests(my_req_ids) 396 | # Log a warning if any of these requests actually launched instances: 397 | (master_nodes, slave_nodes) = get_existing_cluster( 398 | conn, opts, cluster_name, die_on_error=False) 399 | running = len(master_nodes) + len(slave_nodes) 400 | if running: 401 | print >> stderr,("WARNING: %d instances are still running" % running) 402 | sys.exit(0) 403 | else: 404 | # Launch non-spot instances 405 | zones = get_zones(conn, opts) 406 | num_zones = len(zones) 407 | i = 0 408 | slave_nodes = [] 409 | for zone in zones: 410 | num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) 411 | if num_slaves_this_zone > 0: 412 | slave_res = image.run(key_name = opts.key_pair, 413 | security_group_ids = [slave_group.id], 414 | instance_type = opts.instance_type, 415 | subnet_id = 
subnetId(), 416 | placement = zone, 417 | min_count = num_slaves_this_zone, 418 | max_count = num_slaves_this_zone, 419 | block_device_map = block_map, 420 | user_data = user_data, 421 | instance_profile_arn = opts.profile) 422 | slave_nodes += slave_res.instances 423 | print >> stderr,"Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, 424 | zone, slave_res.id) 425 | i += 1 426 | 427 | # Launch or resume masters 428 | if existing_masters: 429 | print "Starting master..." 430 | for inst in existing_masters: 431 | if inst.state not in ["shutting-down", "terminated"]: 432 | inst.start() 433 | master_nodes = existing_masters 434 | else: 435 | master_type = opts.master_instance_type 436 | if master_type == "": 437 | master_type = opts.instance_type 438 | if opts.zone == 'all': 439 | opts.zone = random.choice(conn.get_all_zones()).name 440 | if opts.spot_price != None: 441 | best_price = find_best_price(conn,master_type,opts.zone,opts.spot_price) 442 | # Launch spot instances with the requested price 443 | print >> stderr, ("Requesting master as a spot instance with price $%.3f/hour" % (best_price)) 444 | 445 | interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(subnet_id=subnetId(), groups=[master_group.id], associate_public_ip_address=True) 446 | interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface) 447 | 448 | master_reqs = conn.request_spot_instances( 449 | price = best_price, 450 | image_id = opts.ami, 451 | launch_group = "launch-group-%s" % cluster_name, 452 | placement = opts.zone, 453 | count = 1, 454 | key_name = opts.key_pair, 455 | instance_type = master_type, 456 | block_device_map = block_map, 457 | user_data = user_data, 458 | instance_profile_arn = opts.profile, 459 | network_interfaces = interfaces) 460 | my_req_ids = [r.id for r in master_reqs] 461 | print >> stderr, "Waiting for spot instance to be granted" 462 | try: 463 | while True: 464 | time.sleep(10) 465 | reqs = conn.get_all_spot_instance_requests(request_ids=my_req_ids) 466 | id_to_req = {} 467 | for r in reqs: 468 | id_to_req[r.id] = r 469 | active_instance_ids = [] 470 | for i in my_req_ids: 471 | if i in id_to_req and id_to_req[i].state == "active": 472 | active_instance_ids.append(id_to_req[i].instance_id) 473 | if len(active_instance_ids) == 1: 474 | print >> stderr, "Master granted" 475 | reservations = conn.get_all_instances(active_instance_ids) 476 | master_nodes = [] 477 | for r in reservations: 478 | master_nodes += r.instances 479 | break 480 | else: 481 | # print >> stderr, ".", 482 | print "%d of %d masters granted, waiting longer" % ( 483 | len(active_instance_ids), 1) 484 | except: 485 | print >> stderr, "Canceling spot instance requests" 486 | conn.cancel_spot_instance_requests(my_req_ids) 487 | # Log a warning if any of these requests actually launched instances: 488 | (master_nodes, slave_nodes) = get_existing_cluster( 489 | conn, opts, cluster_name, die_on_error=False) 490 | running = len(master_nodes) + len(slave_nodes) 491 | if running: 492 | print >> stderr, ("WARNING: %d instances are still running" % running) 493 | sys.exit(0) 494 | else: 495 | master_res = image.run(key_name = opts.key_pair, 496 | security_group_ids = [master_group.id], 497 | instance_type = master_type, 498 | subnet_id = subnetId(), 499 | placement = opts.zone, 500 | min_count = 1, 501 | max_count = 1, 502 | block_device_map = block_map, 503 | user_data = user_data, 504 | instance_profile_arn = opts.profile) 505 | master_nodes = master_res.instances 506 | print >> 
stderr,"Launched master in %s, regid = %s" % (opts.zone, master_res.id) 507 | # Return all the instances 508 | return (master_nodes, slave_nodes) 509 | 510 | 511 | # Get the EC2 instances in an existing cluster if available. 512 | # Returns a tuple of lists of EC2 instance objects for the masters and slaves 513 | def get_existing_cluster(conn, opts, cluster_name, die_on_error=True): 514 | print >> stderr,"Searching for existing cluster %s ..." % cluster_name 515 | reservations = conn.get_all_instances() 516 | master_nodes = [] 517 | slave_nodes = [] 518 | for res in reservations: 519 | active = [i for i in res.instances if is_active(i)] 520 | for inst in active: 521 | group_names = [g.name for g in inst.groups] 522 | if (cluster_name + "-master") in group_names: 523 | master_nodes.append(inst) 524 | elif (cluster_name + "-slaves") in group_names: 525 | slave_nodes.append(inst) 526 | if master_nodes != []: 527 | print "Spark standalone cluster started at http://%s:8080" % master_nodes[0].public_dns_name 528 | print "Spark private ip address %s" % master_nodes[0].private_dns_name 529 | print >> stderr, "Spark standalone cluster started at http://%s:8080" % master_nodes[0].public_dns_name 530 | print >> stderr,("Found %d master(s), %d slaves" % 531 | (len(master_nodes), len(slave_nodes))) 532 | get_master_setup_files(master_nodes[0].public_dns_name, opts) 533 | if opts.ganglia: 534 | print >> stderr,"Ganglia started at http://%s:5080/ganglia" % master_nodes[0].public_dns_name 535 | if master_nodes != [] or not die_on_error: 536 | return (master_nodes, slave_nodes) 537 | else: 538 | if master_nodes == [] and slave_nodes != []: 539 | print "ERROR: Could not find master in group %s-master" % cluster_name 540 | else: 541 | print "ERROR: Could not find any existing cluster" 542 | sys.exit(1) 543 | 544 | 545 | # Deploy configuration files and run setup scripts on a newly launched 546 | # or started EC2 cluster. 547 | def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): 548 | 549 | master_nodes[0].update() 550 | master = master_nodes[0] 551 | print "Spark private ip address %s" % master.private_dns_name 552 | if deploy_ssh_key: 553 | print "Generating cluster's SSH key on master..." 554 | key_setup = """ 555 | [ -f ~/.ssh/id_rsa ] || 556 | (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa && 557 | cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys) 558 | """ 559 | ssh(master.private_dns_name, opts, key_setup) 560 | dot_ssh_tar = ssh_read(master.private_dns_name, opts, ['tar', 'c', '.ssh']) 561 | print >> stderr, "Transferring cluster's SSH key to slaves..." 562 | for slave in slave_nodes: 563 | slave.update() 564 | ssh_write(slave.public_dns_name, opts, ['tar', 'x'], dot_ssh_tar) 565 | 566 | modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs', 567 | 'mapreduce', 'spark-standalone'] 568 | 569 | if opts.hadoop_major_version == "1": 570 | modules = filter(lambda x: x != "mapreduce", modules) 571 | 572 | if opts.ganglia: 573 | modules.append('ganglia') 574 | 575 | # NOTE: We should clone the repository before running deploy_files to 576 | # prevent ec2-variables.sh from being overwritten 577 | ssh( 578 | host=master.private_dns_name, 579 | opts=opts, 580 | command="rm -rf spark-ec2" 581 | + " && " 582 | + "git clone https://github.com/paulomagalhaes/spark-ec2.git -b {b}".format(b=MESOS_SPARK_EC2_BRANCH) 583 | ) 584 | 585 | print >> stderr,"Deploying files to master... 
" 586 | (path, name) = os.path.split(__file__) 587 | deploy_files(conn, path+"/deploy.generic", opts, master_nodes, slave_nodes, modules) 588 | 589 | print >> stderr,"Running setup on master... " 590 | setup_spark_cluster(master, opts) 591 | get_master_setup_files(master.private_dns_name, opts) 592 | print >> stderr,"Done!" 593 | 594 | def get_master_setup_files(master, opts): 595 | scp(master, opts, "spark/lib/datanucleus*.jar", "%s/../lib" % SPARK_EC2_DIR) 596 | scp(master, opts, "spark/conf/*", SPARK_EC2_DIR) 597 | 598 | def setup_standalone_cluster(master, slave_nodes, opts): 599 | slave_ips = '\n'.join([i.public_dns_name for i in slave_nodes]) 600 | ssh(master, opts, "echo \"%s\" > spark/conf/slaves" % (slave_ips)) 601 | ssh(master, opts, "/root/spark/sbin/start-all.sh") 602 | 603 | def setup_spark_cluster(master, opts): 604 | ssh(master.private_dns_name, opts, "chmod u+x spark-ec2/setup.sh") 605 | ssh(master.private_dns_name, opts, "spark-ec2/setup.sh") 606 | print "Spark standalone cluster started at http://%s:8080" % master.public_dns_name 607 | print >> stderr, "Spark standalone cluster started at http://%s:8080" % master.public_dns_name 608 | if opts.ganglia: 609 | print >> stderr,"Ganglia started at http://%s:5080/ganglia" % master.public_dns_name 610 | 611 | 612 | 613 | # Wait for a whole cluster (masters, slaves and ZooKeeper) to start up 614 | def wait_for_cluster(conn, wait_secs, master_nodes, slave_nodes): 615 | print >> stderr,"Waiting for instances to start up..." 616 | time.sleep(5) 617 | wait_for_instances(conn, master_nodes) 618 | wait_for_instances(conn, slave_nodes) 619 | 620 | 621 | # Get number of local disks available for a given EC2 instance type. 622 | def get_num_disks(instance_type): 623 | # From http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html 624 | # Updated 2014-6-20 625 | disks_by_instance = { 626 | "m1.small": 1, 627 | "m1.medium": 1, 628 | "m1.large": 2, 629 | "m1.xlarge": 4, 630 | 631 | "c1.medium": 1, 632 | "c1.xlarge": 4, 633 | "m2.xlarge": 1, 634 | "m2.2xlarge": 1, 635 | "m2.4xlarge": 2, 636 | "cc1.4xlarge": 2, 637 | "cc2.8xlarge": 4, 638 | "cg1.4xlarge": 2, 639 | "hs1.8xlarge": 24, 640 | "cr1.8xlarge": 2, 641 | "hi1.4xlarge": 2, 642 | "m3.medium": 1, 643 | "m3.large": 1, 644 | "m3.xlarge": 2, 645 | "m3.2xlarge": 2, 646 | "i2.xlarge": 1, 647 | "i2.2xlarge": 2, 648 | "i2.4xlarge": 4, 649 | "i2.8xlarge": 8, 650 | "c3.large": 2, 651 | "c3.xlarge": 2, 652 | "c3.2xlarge": 2, 653 | "c3.4xlarge": 2, 654 | "c3.8xlarge": 2, 655 | "r3.large": 1, 656 | "r3.xlarge": 1, 657 | "r3.2xlarge": 1, 658 | "r3.4xlarge": 1, 659 | "r3.8xlarge": 2, 660 | "g2.2xlarge": 1, 661 | "t1.micro": 0 662 | } 663 | if instance_type in disks_by_instance: 664 | return disks_by_instance[instance_type] 665 | else: 666 | print >> stderr, ("WARNING: Don't know number of disks on instance type %s; assuming 1" 667 | % instance_type) 668 | return 1 669 | 670 | 671 | # Deploy the configuration file templates in a given local directory to 672 | # a cluster, filling in any template parameters with information about the 673 | # cluster (e.g. lists of masters and slaves). Files are only deployed to 674 | # the first master instance in the cluster, and we expect the setup 675 | # script to be run on that instance to copy them to other nodes.
676 | def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): 677 | active_master = master_nodes[0].public_dns_name 678 | 679 | num_disks = get_num_disks(opts.instance_type) 680 | hdfs_data_dirs = "/mnt/ephemeral-hdfs/data" 681 | mapred_local_dirs = "/mnt/hadoop/mrlocal" 682 | spark_local_dirs = "/mnt/spark" 683 | if num_disks > 1: 684 | for i in range(2, num_disks + 1): 685 | hdfs_data_dirs += ",/mnt%d/ephemeral-hdfs/data" % i 686 | mapred_local_dirs += ",/mnt%d/hadoop/mrlocal" % i 687 | spark_local_dirs += ",/mnt%d/spark" % i 688 | 689 | cluster_url = "%s:7077" % active_master 690 | 691 | if "." in opts.spark_version: 692 | # Pre-built spark & shark deploy 693 | (spark_v, shark_v) = get_spark_shark_version(opts) 694 | else: 695 | # Spark-only custom deploy 696 | spark_v = "%s|%s" % (opts.spark_git_repo, opts.spark_version) 697 | shark_v = "" 698 | modules = filter(lambda x: x != "shark", modules) 699 | 700 | template_vars = { 701 | "master_list": '\n'.join([i.public_dns_name for i in master_nodes]), 702 | "active_master": active_master, 703 | "slave_list": '\n'.join([i.public_dns_name for i in slave_nodes]), 704 | "cluster_url": cluster_url, 705 | "hdfs_data_dirs": hdfs_data_dirs, 706 | "mapred_local_dirs": mapred_local_dirs, 707 | "spark_local_dirs": spark_local_dirs, 708 | "swap": str(opts.swap), 709 | "modules": '\n'.join(modules), 710 | "spark_version": spark_v, 711 | "shark_version": shark_v, 712 | "hadoop_major_version": opts.hadoop_major_version, 713 | "metastore_user": "hive", 714 | "metastore_passwd": ''.join(random.SystemRandom().choice(string.uppercase + string.digits) for _ in xrange(10)), 715 | "spark_worker_instances": "%d" % opts.worker_instances, 716 | "spark_master_opts": opts.master_opts 717 | } 718 | 719 | # Create a temp directory in which we will place all the files to be 720 | # deployed after we substitute template parameters in them 721 | 722 | tmp_dir = tempfile.mkdtemp() 723 | for path, dirs, files in os.walk(root_dir): 724 | if path.find(".svn") == -1: 725 | dest_dir = os.path.join('/', path[len(root_dir):]) 726 | local_dir = tmp_dir + dest_dir 727 | if not os.path.exists(local_dir): 728 | os.makedirs(local_dir) 729 | for filename in files: 730 | if filename[0] not in '#.~' and filename[-1] != '~': 731 | dest_file = os.path.join(dest_dir, filename) 732 | local_file = tmp_dir + dest_file 733 | with open(os.path.join(path, filename)) as src: 734 | with open(local_file, "w") as dest: 735 | text = src.read() 736 | for key in template_vars: 737 | text = text.replace("{{" + key + "}}", template_vars[key]) 738 | dest.write(text) 739 | dest.close() 740 | # rsync the whole directory over to the master machine 741 | command = [ 742 | 'rsync', '-rv', 743 | '-e', stringify_command(ssh_command(opts)), 744 | "%s/" % tmp_dir, 745 | "%s@%s:/" % (opts.user, active_master) 746 | ] 747 | subprocess.check_call(command) 748 | # Remove the temp directory we created above 749 | shutil.rmtree(tmp_dir) 750 | 751 | 752 | 753 | def stringify_command(parts): 754 | if isinstance(parts, str): 755 | return parts 756 | else: 757 | return ' '.join(map(pipes.quote, parts)) 758 | 759 | 760 | def ssh_args(opts): 761 | parts = ['-o', 'StrictHostKeyChecking=no', '-o', 'LogLevel=error'] 762 | # parts += ['-i', '~/.ssh/id_rsa'] 763 | return parts 764 | 765 | 766 | def ssh_command(opts): 767 | return ['ssh'] + ssh_args(opts) 768 | 769 | def scp_command(opts): 770 | return ['scp'] + ssh_args(opts) 771 | 772 | def pub_key(): 773 | key_gen = """[ -f 
~/.ssh/id_rsa ] || 774 | (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa) 775 | """ 776 | subprocess.check_call(key_gen, shell=True) 777 | return subprocess.Popen("cat ~/.ssh/id_rsa.pub", shell=True, stdout=subprocess.PIPE).communicate()[0] 778 | 779 | def profile(): 780 | return subprocess.Popen("""curl -s http://169.254.169.254/latest/meta-data/iam/info | grep InstanceProfileArn""", shell=True, stdout=subprocess.PIPE).communicate()[0].split("\"")[3] 781 | 782 | def region(): 783 | return subprocess.Popen("""curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | grep region""", shell=True, stdout=subprocess.PIPE).communicate()[0].split("\"")[3] 784 | 785 | def zone(): 786 | return subprocess.Popen("""curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | grep availabilityZone""", shell=True, stdout=subprocess.PIPE).communicate()[0].split("\"")[3] 787 | 788 | def subnetId(): 789 | mac = subprocess.Popen("""curl -s http://169.254.169.254/latest/meta-data/network/interfaces/macs/ | grep /""", shell=True, stdout=subprocess.PIPE).communicate()[0].split("/")[0] 790 | return subprocess.Popen("""curl -s http://169.254.169.254/latest/meta-data/network/interfaces/macs/""" + mac + """/subnet-id/""", shell=True, stdout=subprocess.PIPE).communicate()[0] 791 | 792 | def keypair(): 793 | return subprocess.Popen("""curl -s http://169.254.169.254/latest/meta-data/public-keys/0/openssh-key""", shell=True, stdout=subprocess.PIPE).communicate()[0].split(" ")[2].strip() 794 | 795 | # Run a command on a host through ssh, retrying up to ten times 796 | # and then throwing an exception if ssh continues to fail. 797 | def ssh(host, opts, command): 798 | tries = 0 799 | while True: 800 | try: 801 | #print >> stderr, ssh_command(opts) + ['-t', '-t', '%s@%s' % (opts.user, host), stringify_command(command)] 802 | return subprocess.check_call( 803 | ssh_command(opts) + ['-t', '-t', '%s@%s' % (opts.user, host), stringify_command(command)]) 804 | except subprocess.CalledProcessError as e: 805 | if (tries > 10): 806 | print >> stderr,'Failed to SSH to remote host %s after %s retries.' % (host, tries) 807 | # If this was an ssh failure, provide the user with hints. 808 | if e.returncode == 255: 809 | raise UsageError('Failed to SSH to remote host %s.\nPlease check that you have provided the correct --identity-file and --key-pair parameters and try again.' % (host)) 810 | else: 811 | raise e 812 | #print >> stderr,"Error executing remote command, retrying after 30 seconds: {0}".format(e) 813 | time.sleep(30) 814 | tries = tries + 1 815 | 816 | def scp(host, opts, src, target): 817 | tries = 0 818 | while True: 819 | try: 820 | return subprocess.check_call( 821 | scp_command(opts) + ['%s@%s:%s' % (opts.user, host, src), target]) 822 | except subprocess.CalledProcessError as e: 823 | if (tries > 10): 824 | print >> stderr,"Failed to SCP to remote host {0} after {1} retries.".format(host, tries) 825 | # If this was an ssh failure, provide the user with hints. 
826 | if e.returncode == 255: 827 | raise UsageError("Failed to SCP to remote host {0}.\nPlease check that you have provided the correct --identity-file and --key-pair parameters and try again.".format(host)) 828 | else: 829 | raise e 830 | time.sleep(30) 831 | tries = tries + 1 832 | 833 | 834 | # Backported from Python 2.7 for compatibility with 2.6 (See SPARK-1990) 835 | def _check_output(*popenargs, **kwargs): 836 | if 'stdout' in kwargs: 837 | raise ValueError('stdout argument not allowed, it will be overridden.') 838 | process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) 839 | output, unused_err = process.communicate() 840 | retcode = process.poll() 841 | if retcode: 842 | cmd = kwargs.get("args") 843 | if cmd is None: 844 | cmd = popenargs[0] 845 | raise subprocess.CalledProcessError(retcode, cmd, output=output) 846 | return output 847 | 848 | 849 | def ssh_read(host, opts, command): 850 | return _check_output( 851 | ssh_command(opts) + ['%s@%s' % (opts.user, host), stringify_command(command)]) 852 | 853 | 854 | def ssh_write(host, opts, command, input): 855 | tries = 0 856 | while True: 857 | proc = subprocess.Popen( 858 | ssh_command(opts) + ['%s@%s' % (opts.user, host), stringify_command(command)], 859 | stdin=subprocess.PIPE, stderr=subprocess.STDOUT) 860 | proc.stdin.write(input) 861 | proc.stdin.close() 862 | status = proc.wait() 863 | if status == 0: 864 | break 865 | elif (tries > 5): 866 | raise RuntimeError("ssh_write failed with error %s" % proc.returncode) 867 | else: 868 | print >> stderr, "Error {0} while executing remote command, retrying after 30 seconds".format(status) 869 | time.sleep(30) 870 | tries = tries + 1 871 | 872 | 873 | # Gets a list of zones to launch instances in 874 | def get_zones(conn, opts): 875 | if opts.zone == 'all': 876 | zones = [z.name for z in conn.get_all_zones()] 877 | else: 878 | zones = [opts.zone] 879 | return zones 880 | 881 | 882 | # Gets the number of items in a partition 883 | def get_partition(total, num_partitions, current_partitions): 884 | num_slaves_this_zone = total / num_partitions 885 | if (total % num_partitions) - current_partitions > 0: 886 | num_slaves_this_zone += 1 887 | return num_slaves_this_zone 888 | 889 | 890 | def real_main(): 891 | (opts, action, cluster_name) = parse_args() 892 | try: 893 | conn = ec2.connect_to_region(opts.region) 894 | except Exception as e: 895 | print >> stderr,(e) 896 | sys.exit(1) 897 | 898 | # Select an AZ at random if it was not specified. 899 | if opts.zone == "": 900 | opts.zone = random.choice(conn.get_all_zones()).name 901 | 902 | if action == "launch": 903 | if opts.slaves <= 0: 904 | print >> sys.stderr, "ERROR: You have to start at least 1 slave" 905 | sys.exit(1) 906 | if opts.resume: 907 | (master_nodes, slave_nodes) = get_existing_cluster( 908 | conn, opts, cluster_name) 909 | else: 910 | start_secs = time.time() 911 | (master_nodes, slave_nodes) = launch_cluster( 912 | conn, opts, cluster_name) 913 | wait_for_cluster(conn, opts.wait, master_nodes, slave_nodes) 914 | print >> stderr, "Provisioning took %.3f minutes" % ((time.time() - start_secs) / 60.0) 915 | start_secs = time.time() 916 | setup_cluster(conn, master_nodes, slave_nodes, opts, True) 917 | print >> stderr,"Setup took %.3f minutes" % ((time.time() - start_secs)/60.0) 918 | 919 | elif action == "destroy": 920 | (master_nodes, slave_nodes) = get_existing_cluster( 921 | conn, opts, cluster_name, die_on_error=False) 922 | print >> stderr,"Terminating master..." 
923 | for inst in master_nodes: 924 | inst.terminate() 925 | print >> stderr,"Terminating slaves..." 926 | for inst in slave_nodes: 927 | inst.terminate() 928 | 929 | # Delete security groups as well 930 | if opts.delete_groups: 931 | print >> stderr,"Deleting security groups (this will take some time)..." 932 | group_names = [cluster_name + "-master", cluster_name + "-slaves"] 933 | 934 | attempt = 1; 935 | while attempt <= 3: 936 | print >> stderr,"Attempt %d" % attempt 937 | groups = [g for g in conn.get_all_security_groups() if g.name in group_names] 938 | success = True 939 | # Delete individual rules in all groups before deleting groups to 940 | # remove dependencies between them 941 | for group in groups: 942 | print >> stderr,"Deleting rules in security group " + group.name 943 | for rule in group.rules: 944 | for grant in rule.grants: 945 | success &= group.revoke(ip_protocol=rule.ip_protocol, 946 | from_port=rule.from_port, 947 | to_port=rule.to_port, 948 | src_group=grant) 949 | 950 | # Sleep for AWS eventual-consistency to catch up, and for instances 951 | # to terminate 952 | time.sleep(30) # Yes, it does have to be this long :-( 953 | for group in groups: 954 | try: 955 | conn.delete_security_group(group.name) 956 | print >> stderr,"Deleted security group " + group.name 957 | except boto.exception.EC2ResponseError: 958 | success = False; 959 | print >> stderr,"Failed to delete security group " + group.name 960 | 961 | # Unfortunately, group.revoke() returns True even if a rule was not 962 | # deleted, so this needs to be rerun if something fails 963 | if success: break; 964 | 965 | attempt += 1 966 | 967 | if not success: 968 | print >> stderr,"Failed to delete all security groups after 3 tries." 969 | print >> stderr,"Try re-running in a few minutes." 970 | 971 | elif action == "login": 972 | (master_nodes, slave_nodes) = get_existing_cluster( 973 | conn, opts, cluster_name) 974 | master = master_nodes[0].public_dns_name 975 | print "Logging into master " + master + "..." 976 | proxy_opt = [] 977 | if opts.proxy_port != None: 978 | proxy_opt = ['-D', opts.proxy_port] 979 | subprocess.check_call( 980 | ssh_command(opts) + proxy_opt + ['-t', '-t', "%s@%s" % (opts.user, master)]) 981 | 982 | elif action == "get-master": 983 | (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) 984 | print master_nodes[0].public_dns_name 985 | 986 | elif action == "stop": 987 | response = raw_input("Are you sure you want to stop the cluster " + 988 | cluster_name + "?\nDATA ON EPHEMERAL DISKS WILL BE LOST, " + 989 | "BUT THE CLUSTER WILL KEEP USING SPACE ON\n" + 990 | "AMAZON EBS IF IT IS EBS-BACKED!!\n" + 991 | "All data on spot-instance slaves will be lost.\n" + 992 | "Stop cluster " + cluster_name + " (y/N): ") 993 | if response == "y": 994 | (master_nodes, slave_nodes) = get_existing_cluster( 995 | conn, opts, cluster_name, die_on_error=False) 996 | print >> stderr,"Stopping master..." 997 | for inst in master_nodes: 998 | if inst.state not in ["shutting-down", "terminated"]: 999 | inst.stop() 1000 | print >> stderr,"Stopping slaves..." 1001 | for inst in slave_nodes: 1002 | if inst.state not in ["shutting-down", "terminated"]: 1003 | if inst.spot_instance_request_id: 1004 | inst.terminate() 1005 | else: 1006 | inst.stop() 1007 | 1008 | elif action == "start": 1009 | (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) 1010 | print >> stderr,"Starting slaves..." 
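# For the "start" action below: the previously stopped instances are booted again, wait_for_cluster blocks until they are reachable, and setup_cluster is rerun with deploy_ssh_key=False so the SSH key generation/distribution step is skipped this time around.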
1011 | for inst in slave_nodes: 1012 | if inst.state not in ["shutting-down", "terminated"]: 1013 | inst.start() 1014 | print >> stderr,"Starting master..." 1015 | for inst in master_nodes: 1016 | if inst.state not in ["shutting-down", "terminated"]: 1017 | inst.start() 1018 | wait_for_cluster(conn, opts.wait, master_nodes, slave_nodes) 1019 | setup_cluster(conn, master_nodes, slave_nodes, opts, False) 1020 | 1021 | else: 1022 | print >> stderr,"Invalid action: %s" % action 1023 | sys.exit(1) 1024 | 1025 | def find_best_price(conn, instance, zone, factor): 1026 | last_hour_zone = get_spot_price(conn, zone, datetime.utcnow() - timedelta(hours=1), instance) 1027 | average_price_last_hour = sum(i.price for i in last_hour_zone) / float(len(last_hour_zone)) 1028 | return average_price_last_hour * factor 1029 | 1030 | def get_spot_price(conn, zone, start_date_hour, instance): 1031 | return conn.get_spot_price_history(start_time=start_date_hour.strftime("%Y-%m-%dT%H:%M:%SZ"), end_time=datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"), instance_type=instance, product_description="Linux/UNIX", availability_zone=zone) 1032 | 1033 | def main(): 1034 | try: 1035 | real_main() 1036 | except UsageError as e: 1037 | print >> stderr,"\nError:\n", e 1038 | sys.exit(1) 1039 | 1040 | 1041 | if __name__ == "__main__": 1042 | logging.basicConfig() 1043 | main() 1044 | --------------------------------------------------------------------------------