├── .gitignore ├── AUTHORS ├── LICENSE ├── README.md ├── application-deployment-fabfile.py ├── config.yaml.tmpl ├── env.sh ├── images ├── application-deployment-1.png ├── initial-deployment-1.png ├── initial-deployment-2.png └── initial-deployment-3.png ├── initial-deployment-fabfile.py ├── initial-deployment-puppet ├── manifests │ ├── adobe_hadoop.pp.tmpl │ ├── hdfs-master.pp │ ├── hdfs-worker.pp │ ├── spark-master.pp │ └── spark-worker.pp.tmpl └── modules │ ├── cdh4 │ ├── .gitreview │ ├── LICENSE │ ├── README.md │ ├── TODO.md │ ├── files │ │ └── hue │ │ │ └── hue.init.d.sh │ ├── manifests │ │ ├── hadoop.pp │ │ ├── hadoop │ │ │ ├── datanode.pp │ │ │ ├── defaults.pp │ │ │ ├── directory.pp │ │ │ ├── historyserver.pp │ │ │ ├── jmxtrans │ │ │ │ ├── README.md │ │ │ │ ├── datanode.pp │ │ │ │ ├── master.pp │ │ │ │ ├── namenode.pp │ │ │ │ ├── nodemanager.pp │ │ │ │ ├── resourcemanager.pp │ │ │ │ └── worker.pp │ │ │ ├── jobtracker.pp │ │ │ ├── journalnode.pp │ │ │ ├── master.pp │ │ │ ├── namenode.pp │ │ │ ├── namenode │ │ │ │ ├── primary.pp │ │ │ │ └── standby.pp │ │ │ ├── nodemanager.pp │ │ │ ├── resourcemanager.pp │ │ │ ├── tasktracker.pp │ │ │ ├── worker.pp │ │ │ └── worker │ │ │ │ └── paths.pp │ │ ├── hcatalog.pp │ │ ├── hive.pp │ │ ├── hive │ │ │ ├── defaults.pp │ │ │ ├── master.pp │ │ │ ├── metastore.pp │ │ │ ├── metastore │ │ │ │ └── mysql.pp │ │ │ └── server.pp │ │ ├── hue.pp │ │ ├── hue │ │ │ └── defaults.pp │ │ ├── oozie.pp │ │ ├── oozie │ │ │ ├── database │ │ │ │ └── mysql.pp │ │ │ ├── defaults.pp │ │ │ └── server.pp │ │ ├── pig.pp │ │ └── sqoop.pp │ ├── templates │ │ ├── hadoop │ │ │ ├── core-site.xml.erb │ │ │ ├── hadoop-env.sh.erb │ │ │ ├── hadoop-metrics2.properties.erb │ │ │ ├── hdfs-site.xml.erb │ │ │ ├── httpfs-site.xml.erb │ │ │ ├── log4j.properties.erb │ │ │ ├── mapred-site.xml.erb │ │ │ ├── yarn-env.sh.erb │ │ │ └── yarn-site.xml.erb │ │ ├── hive │ │ │ ├── hive-exec-log4j.properties.erb │ │ │ └── hive-site.xml.erb │ │ ├── hue │ │ │ └── hue.ini.erb │ │ ├── oozie │ │ │ ├── oozie-env.sh.erb │ │ │ └── oozie-site.xml.erb │ │ └── pig │ │ │ └── pig.properties.erb │ └── tests │ │ ├── Makefile │ │ ├── datanode.pp │ │ ├── defaults.pp │ │ ├── hadoop.pp │ │ ├── historyserver.pp │ │ ├── hive.pp │ │ ├── hive_master.pp │ │ ├── hive_metastore.pp │ │ ├── hive_metastore_mysql.pp │ │ ├── hive_server.pp │ │ ├── jobtracker.pp │ │ ├── master.pp │ │ ├── namenode.pp │ │ ├── namenode_primary.pp │ │ ├── namenode_standby.pp │ │ ├── nodemanager.pp │ │ ├── pig.pp │ │ ├── resourcemanager.pp │ │ ├── sqoop.pp │ │ ├── tasktracker.pp │ │ └── worker.pp │ └── spark │ ├── LICENSE │ ├── README.md │ ├── files │ └── spark │ │ ├── CHANGES.txt │ │ ├── LICENSE │ │ ├── NOTICE │ │ ├── README.md │ │ ├── RELEASE │ │ ├── bin │ │ ├── compute-classpath.cmd │ │ ├── compute-classpath.sh │ │ ├── load-spark-env.sh │ │ ├── pyspark │ │ ├── pyspark.cmd │ │ ├── pyspark2.cmd │ │ ├── run-example │ │ ├── run-example.cmd │ │ ├── run-example2.cmd │ │ ├── spark-class │ │ ├── spark-class.cmd │ │ ├── spark-class2.cmd │ │ ├── spark-shell │ │ ├── spark-shell.cmd │ │ ├── spark-submit │ │ └── spark-submit.cmd │ │ ├── conf │ │ ├── fairscheduler.xml.template │ │ ├── log4j.properties.template │ │ ├── metrics.properties.template │ │ ├── slaves │ │ ├── spark-defaults.conf.template │ │ └── spark-env.sh.template │ │ ├── ec2 │ │ ├── README │ │ ├── deploy.generic │ │ │ └── root │ │ │ │ └── spark-ec2 │ │ │ │ └── ec2-variables.sh │ │ ├── spark-ec2 │ │ ├── spark_ec2.py │ │ └── third_party │ │ │ └── boto-2.4.1.zip │ │ ├── examples │ │ └── src │ │ │ └── main 
│ │ │ ├── java │ │ │ └── org │ │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── examples │ │ │ │ ├── JavaHdfsLR.java │ │ │ │ ├── JavaLogQuery.java │ │ │ │ ├── JavaPageRank.java │ │ │ │ ├── JavaSparkPi.java │ │ │ │ ├── JavaTC.java │ │ │ │ ├── JavaWordCount.java │ │ │ │ ├── mllib │ │ │ │ ├── JavaALS.java │ │ │ │ ├── JavaKMeans.java │ │ │ │ └── JavaLR.java │ │ │ │ ├── sql │ │ │ │ └── JavaSparkSQL.java │ │ │ │ └── streaming │ │ │ │ ├── JavaCustomReceiver.java │ │ │ │ ├── JavaFlumeEventCount.java │ │ │ │ ├── JavaKafkaWordCount.java │ │ │ │ ├── JavaNetworkWordCount.java │ │ │ │ └── JavaQueueStream.java │ │ │ ├── python │ │ │ ├── als.py │ │ │ ├── kmeans.py │ │ │ ├── logistic_regression.py │ │ │ ├── mllib │ │ │ │ ├── kmeans.py │ │ │ │ └── logistic_regression.py │ │ │ ├── pagerank.py │ │ │ ├── pi.py │ │ │ ├── sort.py │ │ │ ├── transitive_closure.py │ │ │ └── wordcount.py │ │ │ ├── resources │ │ │ ├── kv1.txt │ │ │ └── people.txt │ │ │ └── scala │ │ │ └── org │ │ │ └── apache │ │ │ └── spark │ │ │ └── examples │ │ │ ├── BroadcastTest.scala │ │ │ ├── CassandraCQLTest.scala │ │ │ ├── CassandraTest.scala │ │ │ ├── DriverSubmissionTest.scala │ │ │ ├── ExceptionHandlingTest.scala │ │ │ ├── GroupByTest.scala │ │ │ ├── HBaseTest.scala │ │ │ ├── HdfsTest.scala │ │ │ ├── LocalALS.scala │ │ │ ├── LocalFileLR.scala │ │ │ ├── LocalKMeans.scala │ │ │ ├── LocalLR.scala │ │ │ ├── LocalPi.scala │ │ │ ├── LogQuery.scala │ │ │ ├── MultiBroadcastTest.scala │ │ │ ├── SimpleSkewedGroupByTest.scala │ │ │ ├── SkewedGroupByTest.scala │ │ │ ├── SparkALS.scala │ │ │ ├── SparkHdfsLR.scala │ │ │ ├── SparkKMeans.scala │ │ │ ├── SparkLR.scala │ │ │ ├── SparkPageRank.scala │ │ │ ├── SparkPi.scala │ │ │ ├── SparkTC.scala │ │ │ ├── SparkTachyonHdfsLR.scala │ │ │ ├── SparkTachyonPi.scala │ │ │ ├── bagel │ │ │ ├── PageRankUtils.scala │ │ │ ├── WikipediaPageRank.scala │ │ │ └── WikipediaPageRankStandalone.scala │ │ │ ├── graphx │ │ │ └── LiveJournalPageRank.scala │ │ │ ├── mllib │ │ │ ├── BinaryClassification.scala │ │ │ ├── DecisionTreeRunner.scala │ │ │ ├── DenseKMeans.scala │ │ │ ├── LinearRegression.scala │ │ │ ├── MovieLensALS.scala │ │ │ ├── SparseNaiveBayes.scala │ │ │ ├── TallSkinnyPCA.scala │ │ │ └── TallSkinnySVD.scala │ │ │ ├── sql │ │ │ ├── RDDRelation.scala │ │ │ └── hive │ │ │ │ └── HiveFromSpark.scala │ │ │ └── streaming │ │ │ ├── ActorWordCount.scala │ │ │ ├── CustomReceiver.scala │ │ │ ├── FlumeEventCount.scala │ │ │ ├── HdfsWordCount.scala │ │ │ ├── KafkaWordCount.scala │ │ │ ├── MQTTWordCount.scala │ │ │ ├── NetworkWordCount.scala │ │ │ ├── QueueStream.scala │ │ │ ├── RawNetworkGrep.scala │ │ │ ├── RecoverableNetworkWordCount.scala │ │ │ ├── StatefulNetworkWordCount.scala │ │ │ ├── StreamingExamples.scala │ │ │ ├── TwitterAlgebirdCMS.scala │ │ │ ├── TwitterAlgebirdHLL.scala │ │ │ ├── TwitterPopularTags.scala │ │ │ ├── ZeroMQWordCount.scala │ │ │ └── clickstream │ │ │ ├── PageViewGenerator.scala │ │ │ └── PageViewStream.scala │ │ ├── lib │ │ ├── spark-assembly.1 │ │ └── spark-assembly.2 │ │ ├── python │ │ ├── .gitignore │ │ ├── epydoc.conf │ │ ├── lib │ │ │ ├── PY4J_LICENSE.txt │ │ │ └── py4j-0.8.1-src.zip │ │ ├── pyspark │ │ │ ├── __init__.py │ │ │ ├── accumulators.py │ │ │ ├── broadcast.py │ │ │ ├── cloudpickle.py │ │ │ ├── conf.py │ │ │ ├── context.py │ │ │ ├── daemon.py │ │ │ ├── files.py │ │ │ ├── java_gateway.py │ │ │ ├── join.py │ │ │ ├── mllib │ │ │ │ ├── __init__.py │ │ │ │ ├── _common.py │ │ │ │ ├── classification.py │ │ │ │ ├── clustering.py │ │ │ │ ├── linalg.py │ │ │ │ ├── recommendation.py │ │ │ │ ├── 
regression.py │ │ │ │ ├── tests.py │ │ │ │ └── util.py │ │ │ ├── rdd.py │ │ │ ├── rddsampler.py │ │ │ ├── resultiterable.py │ │ │ ├── serializers.py │ │ │ ├── shell.py │ │ │ ├── sql.py │ │ │ ├── statcounter.py │ │ │ ├── storagelevel.py │ │ │ ├── tests.py │ │ │ └── worker.py │ │ ├── run-tests │ │ └── test_support │ │ │ ├── hello.txt │ │ │ ├── userlib-0.1-py2.7.egg │ │ │ └── userlibrary.py │ │ └── sbin │ │ ├── slaves.sh │ │ ├── spark-config.sh │ │ ├── spark-daemon.sh │ │ ├── spark-daemons.sh │ │ ├── spark-executor │ │ ├── start-all.sh │ │ ├── start-history-server.sh │ │ ├── start-master.sh │ │ ├── start-slave.sh │ │ ├── start-slaves.sh │ │ ├── stop-all.sh │ │ ├── stop-history-server.sh │ │ ├── stop-master.sh │ │ └── stop-slaves.sh │ ├── manifests │ ├── defaults.pp │ ├── master.pp │ ├── spark.pp │ ├── user.pp │ └── worker.pp │ └── templates │ ├── metrics.properties.erb │ ├── spark-env.sh.erb │ ├── spark-master-runner.sh.erb │ ├── spark-master.conf.erb │ ├── spark-worker-runner.sh.erb │ └── spark-worker.conf.erb ├── requirements.txt └── sample-application ├── .gitignore ├── build.sbt ├── config.yaml.tmpl └── src └── main └── scala └── ExampleApp.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | config.yaml 3 | project 4 | target 5 | adobe_hadoop.pp 6 | spark-worker.pp 7 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Brandon Amos 2 | David Tompkins 3 | -------------------------------------------------------------------------------- /config.yaml.tmpl: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | ## 3 | ## Copyright (c) 2014 Adobe Systems Incorporated. All rights reserved. 4 | ## 5 | ## Licensed under the Apache License, Version 2.0 (the "License"); 6 | ## you may not use this file except in compliance with the License. 7 | ## You may obtain a copy of the License at 8 | ## 9 | ## http://www.apache.org/licenses/LICENSE-2.0 10 | ## 11 | ## Unless required by applicable law or agreed to in writing, software 12 | ## distributed under the License is distributed on an "AS IS" BASIS, 13 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ## See the License for the specific language governing permissions and 15 | ## limitations under the License. 16 | ## 17 | ########################################################################### 18 | 19 | master: 20 | - server0 21 | all: &all 22 | - server0 23 | - server1 24 | - server2 25 | - server3 26 | - server4 27 | - server5 28 | workers: 29 | *all 30 | -------------------------------------------------------------------------------- /env.sh: -------------------------------------------------------------------------------- 1 | # env.sh 2 | # Source this script for Spark standalone deployment shell functions. 3 | # 4 | ########################################################################### 5 | ## 6 | ## Copyright (c) 2014 Adobe Systems Incorporated. All rights reserved. 7 | ## 8 | ## Licensed under the Apache License, Version 2.0 (the "License"); 9 | ## you may not use this file except in compliance with the License. 
10 | ## You may obtain a copy of the License at 11 | ## 12 | ## http://www.apache.org/licenses/LICENSE-2.0 13 | ## 14 | ## Unless required by applicable law or agreed to in writing, software 15 | ## distributed under the License is distributed on an "AS IS" BASIS, 16 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | ## See the License for the specific language governing permissions and 18 | ## limitations under the License. 19 | ## 20 | ########################################################################### 21 | 22 | DEPLOY_DIR="$( cd "$( dirname "$0" )" && pwd )" 23 | INITIAL_DIR="$DEPLOY_DIR/initial-deployment" 24 | APPLICATION_DIR="$DEPLOY_DIR/application-deployment" 25 | 26 | # Initial deployment shell aliases/functions. 27 | function spark-init() { 28 | fab -f $DEPLOY_DIR/initial-deployment-fabfile.py $* 29 | } 30 | 31 | alias si='spark-init' 32 | alias si-list='spark-init -list' 33 | alias si-start-hm='spark-init startHdfsMaster' 34 | alias si-start-hw='spark-init startHdfsWorkers' 35 | alias si-start-sm='spark-init startSparkMaster' 36 | alias si-start-sw='spark-init startSparkWorkers' 37 | alias si-stop-hm='spark-init stopHdfsMaster' 38 | alias si-stop-hw='spark-init stopHdfsWorkers' 39 | alias si-stop-sm='spark-init stopSparkMaster' 40 | alias si-stop-sw='spark-init stopSparkWorkers' 41 | 42 | # Application deployment shell aliases/functions. 43 | function spark-submit() { 44 | fab -f $DEPLOY_DIR/application-deployment-fabfile.py $* 45 | } 46 | 47 | alias ss='spark-submit' 48 | alias ss-list='spark-submit -list' 49 | alias ss-sy='spark-submit sync' 50 | alias ss-st='spark-submit start' 51 | alias ss-a='spark-submit assembly' 52 | alias ss-ss='spark-submit sync start' 53 | alias ss-o='spark-submit getOutput' 54 | alias ss-k='spark-submit kill' 55 | -------------------------------------------------------------------------------- /images/application-deployment-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/images/application-deployment-1.png -------------------------------------------------------------------------------- /images/initial-deployment-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/images/initial-deployment-1.png -------------------------------------------------------------------------------- /images/initial-deployment-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/images/initial-deployment-2.png -------------------------------------------------------------------------------- /images/initial-deployment-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/images/initial-deployment-3.png -------------------------------------------------------------------------------- /initial-deployment-puppet/manifests/adobe_hadoop.pp.tmpl: -------------------------------------------------------------------------------- 1 | class adobe::hadoop_base { 2 | class { 'cdh4::hadoop': 3 | namenode_hosts => ['namenode_server'], 4 | datanode_mounts => [
5 | '/raid/hadoop/data' 6 | ], 7 | dfs_name_dir => '/raid/hadoop/name' 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /initial-deployment-puppet/manifests/hdfs-master.pp: -------------------------------------------------------------------------------- 1 | import "adobe_hadoop" 2 | 3 | node default { 4 | include adobe::hadoop_base 5 | include cdh4::hadoop::master 6 | } 7 | -------------------------------------------------------------------------------- /initial-deployment-puppet/manifests/hdfs-worker.pp: -------------------------------------------------------------------------------- 1 | import "adobe_hadoop" 2 | 3 | node default { 4 | include adobe::hadoop_base 5 | include cdh4::hadoop::worker 6 | } 7 | -------------------------------------------------------------------------------- /initial-deployment-puppet/manifests/spark-master.pp: -------------------------------------------------------------------------------- 1 | node default { 2 | class { 'spark::master': 3 | worker_mem => '22g' 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /initial-deployment-puppet/manifests/spark-worker.pp.tmpl: -------------------------------------------------------------------------------- 1 | node default { 2 | class { 'spark::worker': 3 | master => 'namenode_server', 4 | memory => '22g', 5 | scratch_dir => "/raid/spark-work" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/.gitreview: -------------------------------------------------------------------------------- 1 | [gerrit] 2 | host=gerrit.wikimedia.org 3 | port=29418 4 | project=operations/puppet/cdh4.git 5 | defaultbranch=master 6 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | =============== 3 | 4 | Copyright (c) 2013 Andrew Otto , the Wikimedia Foundation. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/TODO.md: -------------------------------------------------------------------------------- 1 | **Table of Contents** *generated with [DocToc](http://doctoc.herokuapp.com/)* 2 | 3 | - [TODO:](#todo) 4 | - [Hadoop](#hadoop) 5 | - [HBase](#hbase) 6 | - [Zookeeper](#zookeeper) 7 | 8 | # TODO: 9 | 10 | ## Hadoop 11 | 12 | - Add hosts.exclude support for decommissioning nodes. 13 | - Change cluster (conf) name? (use update-alternatives?) 14 | - Set default # map/reduce tasks automatically based on facter node stats. 15 | - Handle ensure => absent, especially for MRv1 vs YARN packages and services. 16 | - Implement standalone yarn proxyserver support. 17 | - Make log4j.properties more configurable. 18 | - Support Secondary NameNode. 19 | - Make JMX ports configurable. 20 | - Make hadoop-metrics2.properties more configurable. 21 | - Support HA automatic failover. 22 | - HA NameNode Fencing support. 23 | - Rename 'use_yarn' parameter to 'yarn_enabled' for consistency. 24 | 25 | ## HBase 26 | - Implement. 27 | 28 | ## Zookeeper 29 | 30 | Won't implement. A Zookeeper package is available upstream in Debian/Ubuntu. 31 | Puppetization for this package can be found at 32 | https://github.com/wikimedia/operations-puppet-zookeeper 33 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/datanode.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::datanode 2 | # Installs and starts up a Hadoop DataNode. 3 | # 4 | class cdh4::hadoop::datanode { 5 | Class['cdh4::hadoop'] -> Class['cdh4::hadoop::datanode'] 6 | 7 | # install datanode daemon package 8 | package { 'hadoop-hdfs-datanode': 9 | ensure => 'installed' 10 | } 11 | 12 | # start the datanode daemon service 13 | service { 'hadoop-hdfs-datanode': 14 | ensure => 'running', 15 | enable => true, 16 | hasstatus => true, 17 | hasrestart => true, 18 | alias => 'datanode', 19 | require => Package['hadoop-hdfs-datanode'], 20 | } 21 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/defaults.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::defaults 2 | # Default parameters for cdh4::hadoop configuration.
3 | # 4 | class cdh4::hadoop::defaults { 5 | $config_directory = '/etc/hadoop/conf' 6 | 7 | $nameservice_id = undef 8 | $journalnode_hosts = undef 9 | $dfs_journalnode_edits_dir = undef 10 | 11 | $datanode_mounts = undef 12 | $dfs_data_path = 'hdfs/dn' 13 | $yarn_local_path = 'yarn/local' 14 | $yarn_logs_path = 'yarn/logs' 15 | $dfs_block_size = 67108864 # 64MB default 16 | $enable_jmxremote = true 17 | $enable_webhdfs = true 18 | $mapreduce_system_dir = undef 19 | $io_file_buffer_size = undef 20 | $mapreduce_map_tasks_maximum = undef 21 | $mapreduce_reduce_tasks_maximum = undef 22 | $mapreduce_job_reuse_jvm_num_tasks = undef 23 | $mapreduce_reduce_shuffle_parallelcopies = undef 24 | $mapreduce_map_memory_mb = undef 25 | $mapreduce_reduce_memory_mb = undef 26 | $mapreduce_task_io_sort_mb = undef 27 | $mapreduce_task_io_sort_factor = undef 28 | $mapreduce_map_java_opts = undef 29 | $mapreduce_reduce_java_opts = undef 30 | $mapreduce_shuffle_port = undef 31 | $mapreduce_intermediate_compression = false 32 | $mapreduce_intermediate_compression_codec = 'org.apache.hadoop.io.compress.DefaultCodec' 33 | $mapreduce_output_compression = false 34 | $mapreduce_output_compression_codec = 'org.apache.hadoop.io.compress.DefaultCodec' 35 | $mapreduce_output_compression_type = 'RECORD' 36 | $yarn_nodemanager_resource_memory_mb = undef 37 | $yarn_resourcemanager_scheduler_class = undef 38 | $use_yarn = true 39 | $ganglia_hosts = undef 40 | $net_topology_script_template = undef 41 | 42 | 43 | # JMX Ports (These are not currently configurable) 44 | $namenode_jmxremote_port = 9980 45 | $datanode_jmxremote_port = 9981 46 | $resourcemanager_jmxremote_port = 9983 47 | $nodemanager_jmxremote_port = 9984 48 | $proxyserver_jmxremote_port = 9985 49 | } 50 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/directory.pp: -------------------------------------------------------------------------------- 1 | # == Define cdh4::hadoop::directory 2 | # 3 | # Creates or removes a directory in HDFS. 4 | # 5 | # == Notes: 6 | # This will not check ownership and permissions 7 | # of a directory. It will only check for the directory's 8 | # existence. If it does not exist, the directory will be 9 | # created and given specified ownership and permissions. 10 | # This will not attempt to set ownership and permissions 11 | # if the directory already exists. 12 | # 13 | # This define does not support managing files in HDFS, 14 | # only directories. 15 | # 16 | # Ideally this define would be ported into a Puppet File Provider. 17 | # I once spent some time trying to make that work, but it was more 18 | # difficult than it sounds. For example, you'd need to handle conversion 19 | # between symbolic and numeric modes, as I could not find a way to 20 | # get hadoop fs to list numeric modes for comparison. Perhaps 21 | # there's a way to use HttpFS to do this instead? 22 | # 23 | # == Parameters: 24 | # $path - HDFS directory path. Default: $title 25 | # $ensure - present|absent. Default: present 26 | # $owner - HDFS directory owner. Default: hdfs 27 | # $group - HDFS directory group owner. Default: hdfs 28 | # $mode - HDFS directory mode.
Default 0755 29 | # 30 | define cdh4::hadoop::directory ( 31 | $path = $title, 32 | $ensure = 'present', 33 | $owner = 'hdfs', 34 | $group = 'hdfs', 35 | $mode = '0755') 36 | { 37 | Class['cdh4::hadoop'] -> Cdh4::Hadoop::Directory[$title] 38 | 39 | if $ensure == 'present' { 40 | exec { "cdh4::hadoop::directory ${title}": 41 | command => "/usr/bin/hadoop fs -mkdir ${path} && /usr/bin/hadoop fs -chmod ${mode} ${path} && /usr/bin/hadoop fs -chown ${owner}:${group} ${path}", 42 | unless => "/usr/bin/hadoop fs -test -e ${path}", 43 | user => 'hdfs', 44 | } 45 | } 46 | else { 47 | exec { "cdh4::hadoop::directory ${title}": 48 | command => "/usr/bin/hadoop fs -rm -R ${path}", 49 | onlyif => "/usr/bin/hadoop fs -test -e ${path}", 50 | user => 'hdfs', 51 | require => Service['hadoop-hdfs-namenode'], 52 | } 53 | } 54 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/historyserver.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::historyserver 2 | # Installs and starts up a Hadoop YARN HistoryServer. 3 | # This will ensure that the HDFS /user/history exists. 4 | # This class may only be included on the NameNode Master 5 | # Hadoop node. 6 | # 7 | class cdh4::hadoop::historyserver { 8 | Class['cdh4::hadoop::namenode'] -> Class['cdh4::hadoop::historyserver'] 9 | 10 | if !$::cdh4::hadoop::use_yarn { 11 | fail('Cannot use Hadoop YARN HistoryServer if cdh4::hadoop::use_yarn is false.') 12 | } 13 | 14 | # Create HistoryServer HDFS directories. 15 | # See: http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/4.2.0/CDH4-Installation-Guide/cdh4ig_topic_11_4.html 16 | cdh4::hadoop::directory { '/user/history': 17 | # sudo -u hdfs hadoop fs -mkdir /user/history 18 | # sudo -u hdfs hadoop fs -chmod -R 1777 /user/history 19 | # sudo -u hdfs hadoop fs -chown yarn /user/history 20 | owner => 'yarn', 21 | group => 'hdfs', 22 | mode => '1777', 23 | # Make sure HDFS directories are created before 24 | # historyserver is installed and started, but after 25 | # the namenode. 26 | require => [Service['hadoop-hdfs-namenode'], Cdh4::Hadoop::Directory['/user']], 27 | } 28 | 29 | package { 'hadoop-mapreduce-historyserver': 30 | ensure => 'installed', 31 | require => Cdh4::Hadoop::Directory['/user/history'], 32 | } 33 | 34 | service { 'hadoop-mapreduce-historyserver': 35 | ensure => 'running', 36 | enable => true, 37 | hasstatus => true, 38 | hasrestart => true, 39 | alias => 'historyserver', 40 | require => Package['hadoop-mapreduce-historyserver'], 41 | } 42 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/jmxtrans/README.md: -------------------------------------------------------------------------------- 1 | Hadoop very conveniently ships with built-in Ganglia metrics reporter support. 2 | However, the GangliaContext class uses DatagramSocket instead of MulticastSocket. 3 | This will only work in Ganglia multicast setups where there is no more than 4 | 1 network hop needed to get to the Ganglia aggregator(s) for your multicast group. 5 | See https://issues.apache.org/jira/browse/HADOOP-10181 for more details. 6 | 7 | Wikimedia uses a multi-row VLAN setup for its Hadoop nodes, and needs a way 8 | to send Hadoop metrics to Ganglia in a multicast setup. Jmxtrans supports 9 | this.
These jmxtrans classes can be included to send a particular Hadoop 10 | service's metrics to Ganglia. 11 | 12 | # Usage 13 | 14 | On your Hadoop master node: 15 | 16 | ```puppet 17 | class { 'cdh4::hadoop::jmxtrans::master': 18 | ganglia => 'ganglia.example.com', 19 | } 20 | ``` 21 | 22 | On your Hadoop worker nodes: 23 | ```puppet 24 | class { 'cdh4::hadoop::jmxtrans::worker': 25 | ganglia => 'ganglia.example.com', 26 | } 27 | ``` 28 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/jmxtrans/master.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::jmxtrans::master 2 | # Convenience class to include jmxtrans classes for NameNode and ResourceManager 3 | class cdh4::hadoop::jmxtrans::master( 4 | $ganglia = undef, 5 | $graphite = undef, 6 | $outfile = undef, 7 | ) 8 | { 9 | class { ['cdh4::hadoop::jmxtrans::namenode', 'cdh4::hadoop::jmxtrans::resourcemanager']: 10 | ganglia => $ganglia, 11 | graphite => $graphite, 12 | outfile => $outfile, 13 | } 14 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/jmxtrans/worker.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::jmxtrans::worker 2 | # Convenience class to include jmxtrans classes for DataNode and NodeManager 3 | class cdh4::hadoop::jmxtrans::worker( 4 | $ganglia = undef, 5 | $graphite = undef, 6 | $outfile = undef, 7 | ) 8 | { 9 | class { ['cdh4::hadoop::jmxtrans::datanode', 'cdh4::hadoop::jmxtrans::nodemanager']: 10 | ganglia => $ganglia, 11 | graphite => $graphite, 12 | outfile => $outfile, 13 | } 14 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/journalnode.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::journalnode 2 | # 3 | class cdh4::hadoop::journalnode { 4 | Class['cdh4::hadoop'] -> Class['cdh4::hadoop::journalnode'] 5 | 6 | # install journalnode daemon package 7 | package { 'hadoop-hdfs-journalnode': 8 | ensure => 'installed' 9 | } 10 | 11 | # Ensure that the journalnode edits directory has the correct permissions. 12 | file { $::cdh4::hadoop::dfs_journalnode_edits_dir: 13 | ensure => 'directory', 14 | owner => 'hdfs', 15 | group => 'hdfs', 16 | mode => '0755', 17 | require => Package['hadoop-hdfs-journalnode'], 18 | } 19 | 20 | # start the journalnode daemon service 21 | service { 'hadoop-hdfs-journalnode': 22 | ensure => 'running', 23 | enable => true, 24 | hasstatus => true, 25 | hasrestart => true, 26 | alias => 'journalnode', 27 | require => File[$::cdh4::hadoop::dfs_journalnode_edits_dir], 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/master.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::master 2 | # Wrapper class for Hadoop master node services: 3 | # - NameNode 4 | # - ResourceManager and HistoryServer (YARN) 5 | # OR 6 | # - JobTracker (MRv1). 7 | # 8 | class cdh4::hadoop::master { 9 | Class['cdh4::hadoop'] -> Class['cdh4::hadoop::master'] 10 | 11 | include cdh4::hadoop::namenode::primary 12 | 13 | # YARN uses ResourceManager and HistoryServer, 14 | # NOT JobTracker.
15 | if $::cdh4::hadoop::use_yarn { 16 | include cdh4::hadoop::resourcemanager 17 | include cdh4::hadoop::historyserver 18 | } 19 | # MRv1 just uses JobTracker 20 | else { 21 | include cdh4::hadoop::jobtracker 22 | } 23 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/namenode.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::namenode 2 | # Installs and configures a Hadoop NameNode. 3 | # This will format the NameNode if it is not 4 | # already formatted. It will also create 5 | # a common HDFS directory hierarchy. 6 | # 7 | # Note: If you are using HA NameNode (indicated by setting 8 | # cdh4::hadoop::nameservice_id), your JournalNodes should be running before 9 | # this class is applied. 10 | # 11 | class cdh4::hadoop::namenode { 12 | Class['cdh4::hadoop'] -> Class['cdh4::hadoop::namenode'] 13 | 14 | # install namenode daemon package 15 | package { 'hadoop-hdfs-namenode': 16 | ensure => installed 17 | } 18 | 19 | file { "${::cdh4::hadoop::config_directory}/hosts.exclude": 20 | ensure => 'present', 21 | require => Package['hadoop-hdfs-namenode'], 22 | } 23 | 24 | # Ensure that the namenode directory has the correct permissions. 25 | file { $::cdh4::hadoop::dfs_name_dir: 26 | ensure => 'directory', 27 | owner => 'hdfs', 28 | group => 'hdfs', 29 | mode => '0700', 30 | require => Package['hadoop-hdfs-namenode'], 31 | } 32 | 33 | # If $dfs_name_dir/current/VERSION doesn't exist, assume 34 | # NameNode has not been formatted. Format it before 35 | # the namenode service is started. 36 | exec { 'hadoop-namenode-format': 37 | command => '/usr/bin/hdfs namenode -format', 38 | creates => "${::cdh4::hadoop::dfs_name_dir_main}/current/VERSION", 39 | user => 'hdfs', 40 | require => File[$::cdh4::hadoop::dfs_name_dir], 41 | } 42 | 43 | service { 'hadoop-hdfs-namenode': 44 | ensure => 'running', 45 | enable => true, 46 | hasstatus => true, 47 | hasrestart => true, 48 | alias => 'namenode', 49 | require => [File["${::cdh4::hadoop::config_directory}/hosts.exclude"], Exec['hadoop-namenode-format']], 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/namenode/standby.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::namenode::standby 2 | # Hadoop Standby NameNode. Include this class instead of 3 | # cdh4::hadoop::master on your HA standby NameNode(s). This 4 | # will bootstrap the standby dfs.name.dir with the contents 5 | # from your primary active NameNode. 6 | # 7 | # See README.md for more documentation. 8 | # 9 | # NOTE: Your JournalNodes should be running before this class is applied. 10 | # 11 | class cdh4::hadoop::namenode::standby inherits cdh4::hadoop::namenode { 12 | # Fail if nameservice_id isn't set. 13 | if (!$::cdh4::hadoop::ha_enabled) { 14 | fail('Cannot use Standby NameNode in a non HA setup. Set $nameservice_id on the cdh4::hadoop class to enable HA.') 15 | } 16 | 17 | # Override the namenode -format command to bootstrap this 18 | # standby NameNode's dfs.name.dir with the data from the 19 | # active NameNode.
20 | Exec['hadoop-namenode-format'] { 21 | command => '/usr/bin/hdfs namenode -bootstrapStandby', 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/nodemanager.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::nodemanager 2 | # Installs and configures a Hadoop NodeManager worker node. 3 | # 4 | class cdh4::hadoop::nodemanager { 5 | Class['cdh4::hadoop'] -> Class['cdh4::hadoop::nodemanager'] 6 | 7 | if !$::cdh4::hadoop::use_yarn { 8 | fail('Cannot use Hadoop YARN NodeManager if cdh4::hadoop::use_yarn is false.') 9 | } 10 | 11 | package { ['hadoop-yarn-nodemanager', 'hadoop-mapreduce']: 12 | ensure => 'installed', 13 | } 14 | 15 | # NodeManager (YARN TaskTracker) 16 | service { 'hadoop-yarn-nodemanager': 17 | ensure => 'running', 18 | enable => true, 19 | hasstatus => true, 20 | hasrestart => true, 21 | alias => 'nodemanager', 22 | require => [Package['hadoop-yarn-nodemanager', 'hadoop-mapreduce']], 23 | } 24 | } 25 | 26 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/resourcemanager.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::resourcemanager 2 | # Installs and configures Hadoop YARN ResourceManager. 3 | # This will create YARN HDFS directories. 4 | # 5 | class cdh4::hadoop::resourcemanager { 6 | Class['cdh4::hadoop::namenode'] -> Class['cdh4::hadoop::resourcemanager'] 7 | 8 | if !$::cdh4::hadoop::use_yarn { 9 | fail('Cannot use Hadoop YARN ResourceManager if cdh4::hadoop::use_yarn is false.') 10 | } 11 | 12 | # Create YARN HDFS directories. 13 | # See: http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/4.2.0/CDH4-Installation-Guide/cdh4ig_topic_11_4.html 14 | cdh4::hadoop::directory { '/var/log/hadoop-yarn': 15 | # sudo -u hdfs hadoop fs -mkdir /var/log/hadoop-yarn 16 | # sudo -u hdfs hadoop fs -chown yarn:mapred /var/log/hadoop-yarn 17 | owner => 'yarn', 18 | group => 'mapred', 19 | mode => '0755', 20 | # Make sure HDFS directories are created before 21 | # resourcemanager is installed and started, but after 22 | # the namenode. 23 | require => [Service['hadoop-hdfs-namenode'], Cdh4::Hadoop::Directory['/var/log']], 24 | } 25 | 26 | package { 'hadoop-yarn-resourcemanager': 27 | ensure => 'installed', 28 | require => Cdh4::Hadoop::Directory['/var/log/hadoop-yarn'], 29 | } 30 | 31 | service { 'hadoop-yarn-resourcemanager': 32 | ensure => 'running', 33 | enable => true, 34 | hasstatus => true, 35 | hasrestart => true, 36 | alias => 'resourcemanager', 37 | require => Package['hadoop-yarn-resourcemanager'], 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/tasktracker.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::tasktracker 2 | # Installs and configures Hadoop MRv1 TaskTracker. 
3 | class cdh4::hadoop::tasktracker { 4 | Class['cdh4::hadoop'] -> Class['cdh4::hadoop::tasktracker'] 5 | 6 | if $::cdh4::hadoop::use_yarn { 7 | fail('Cannot use Hadoop MRv1 TaskTracker if cdh4::hadoop::use_yarn is true.') 8 | } 9 | 10 | # install tasktracker daemon package 11 | package { 'hadoop-0.20-mapreduce-tasktracker': 12 | ensure => 'installed' 13 | } 14 | 15 | service { 'hadoop-0.20-mapreduce-tasktracker': 16 | ensure => 'running', 17 | enable => true, 18 | hasstatus => true, 19 | hasrestart => true, 20 | alias => 'tasktracker', 21 | require => Package['hadoop-0.20-mapreduce-tasktracker'], 22 | } 23 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/worker.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::worker 2 | # Wrapper class for Hadoop Worker node services: 3 | # - DataNode 4 | # - NodeManager (YARN) 5 | # OR 6 | # - TaskTracker (MRv1) 7 | # 8 | # This class will attempt to create and manage the required 9 | # local worker directories defined in the $datanode_mounts array. 10 | # You must make sure that the paths defined in $datanode_mounts are 11 | # formatted and mounted properly yourself; the CDH4 module does not 12 | # manage them. 13 | # 14 | class cdh4::hadoop::worker { 15 | Class['cdh4::hadoop'] -> Class['cdh4::hadoop::worker'] 16 | 17 | cdh4::hadoop::worker::paths { $::cdh4::hadoop::datanode_mounts: } 18 | 19 | class { 'cdh4::hadoop::datanode': 20 | require => Cdh4::Hadoop::Worker::Paths[$::cdh4::hadoop::datanode_mounts], 21 | } 22 | 23 | # YARN uses NodeManager. 24 | if $::cdh4::hadoop::use_yarn { 25 | class { 'cdh4::hadoop::nodemanager': 26 | require => Cdh4::Hadoop::Worker::Paths[$::cdh4::hadoop::datanode_mounts], 27 | } 28 | } 29 | # MRv1 uses TaskTracker. 30 | else { 31 | class { 'cdh4::hadoop::tasktracker': 32 | require => Cdh4::Hadoop::Worker::Paths[$::cdh4::hadoop::datanode_mounts], 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hcatalog.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hcatalog 2 | # This class doesn't yet do anything other than 3 | # install the hcatalog package.
This will be expanded 4 | # If/when we need more functionality (hcatalog-server, etc.), 5 | # 6 | class cdh4::hcatalog { 7 | package { 'hcatalog': 8 | ensure => 'installed', 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hive/defaults.pp: -------------------------------------------------------------------------------- 1 | # == Class hive::defaults 2 | # Default Hive configs 3 | # 4 | class cdh4::hive::defaults { 5 | $zookeeper_hosts = undef 6 | 7 | $jdbc_driver = 'com.mysql.jdbc.Driver' 8 | $jdbc_protocol = 'mysql' 9 | $jdbc_database = 'hive_metastore' 10 | $jdbc_host = 'localhost' 11 | $jdbc_port = 3306 12 | $jdbc_username = 'hive' 13 | $jdbc_password = 'hive' 14 | 15 | $db_root_username = undef 16 | $db_root_password = undef 17 | 18 | $exec_parallel_thread_number = 8 # set this to 0 to disable hive.exec.parallel 19 | $optimize_skewjoin = false 20 | $skewjoin_key = 10000 21 | $skewjoin_mapjoin_map_tasks = 10000 22 | $skewjoin_mapjoin_min_split = 33554432 23 | 24 | $stats_enabled = false 25 | $stats_dbclass = 'jdbc:derby' 26 | $stats_jdbcdriver = 'org.apache.derby.jdbc.EmbeddedDriver' 27 | $stats_dbconnectionstring = 'jdbc:derby:;databaseName=TempStatsStore;create=true' 28 | 29 | # Default puppet paths to template config files. 30 | # This allows us to use custom template config files 31 | # if we want to override more settings than this 32 | # module yet supports. 33 | $hive_site_template = 'cdh4/hive/hive-site.xml.erb' 34 | $hive_exec_log4j_template = 'cdh4/hive/hive-exec-log4j.properties.erb' 35 | 36 | # Further path/jar to add to hive's classpath 37 | # (Until Hive 0.12.0 this can only be a single path (see HIVE-2269 ) 38 | $auxpath = undef 39 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hive/master.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hive::master 2 | # Wrapper class for hive::server, hive::metastore, and hive::metastore::* databases. 3 | # 4 | # Include this class on your Hive master node with $metastore_database 5 | # set to one of the available metastore backend classes in the hive/metastore/ 6 | # directory. If you want to set up a hive metastore database backend that 7 | # is not supported here, you may set $metastore_databse to undef. 8 | # 9 | # You must separately ensure that your $metastore_database (e.g. mysql) package 10 | # is installed. 11 | # 12 | # == Parameters 13 | # $metastore_database - Name of metastore database to use. This should be 14 | # the name of a cdh4::hive::metastore::* class in 15 | # hive/metastore/*.pp. 16 | # 17 | class cdh4::hive::master($metastore_database = 'mysql') { 18 | class { 'cdh4::hive::server': } 19 | class { 'cdh4::hive::metastore': } 20 | 21 | # Set up the metastore database by including 22 | # the $metastore_database_class. 
23 | $metastore_database_class = "cdh4::hive::metastore::${metastore_database}" 24 | if ($metastore_database) { 25 | class { $metastore_database_class: } 26 | } 27 | 28 | # Make sure the $metastore_database_class is included and set up 29 | # before we start the hive-metastore service 30 | Class[$metastore_database_class] -> Class['cdh4::hive::metastore'] 31 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hive/metastore.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hive::metastore 2 | # 3 | class cdh4::hive::metastore 4 | { 5 | Class['cdh4::hive'] -> Class['cdh4::hive::metastore'] 6 | 7 | package { 'hive-metastore': 8 | ensure => 'installed', 9 | } 10 | 11 | service { 'hive-metastore': 12 | ensure => 'running', 13 | require => Package['hive-metastore'], 14 | hasrestart => true, 15 | hasstatus => true, 16 | } 17 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hive/server.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hive::server 2 | # Configures hive-server2. Requires that cdh4::hadoop is included so that 3 | # hadoop-client is available to create hive HDFS directories. 4 | # 5 | # See: http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/4.2.0/CDH4-Installation-Guide/cdh4ig_topic_18_5.html 6 | # 7 | class cdh4::hive::server 8 | { 9 | # cdh4::hive::server requires hadoop client and configs are installed. 10 | Class['cdh4::hadoop'] -> Class['cdh4::hive::server'] 11 | Class['cdh4::hive'] -> Class['cdh4::hive::server'] 12 | 13 | package { 'hive-server2': 14 | ensure => 'installed', 15 | alias => 'hive-server', 16 | } 17 | 18 | # sudo -u hdfs hadoop fs -mkdir /user/hive 19 | # sudo -u hdfs hadoop fs -chmod 0775 /user/hive 20 | # sudo -u hdfs hadoop fs -chown hive:hadoop /user/hive 21 | cdh4::hadoop::directory { '/user/hive': 22 | owner => 'hive', 23 | group => 'hadoop', 24 | mode => '0775', 25 | require => Package['hive'], 26 | } 27 | # sudo -u hdfs hadoop fs -mkdir /user/hive/warehouse 28 | # sudo -u hdfs hadoop fs -chmod 1777 /user/hive/warehouse 29 | # sudo -u hdfs hadoop fs -chown hive:hadoop /user/hive/warehouse 30 | cdh4::hadoop::directory { '/user/hive/warehouse': 31 | owner => 'hive', 32 | group => 'hadoop', 33 | mode => '1777', 34 | require => Cdh4::Hadoop::Directory['/user/hive'], 35 | } 36 | 37 | service { 'hive-server2': 38 | ensure => 'running', 39 | require => Package['hive-server2'], 40 | hasrestart => true, 41 | hasstatus => true, 42 | } 43 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hue/defaults.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hue::defaults 2 | # 3 | class cdh4::hue::defaults { 4 | $http_host = '0.0.0.0' 5 | $http_port = 8888 6 | $secret_key = undef 7 | 8 | # Set Hue Oozie defaults to those already 9 | # set in the cdh4::oozie class. 10 | if (defined(Class['cdh4::oozie'])) { 11 | $oozie_url = $cdh4::oozie::url 12 | # Is this the proper default values? I'm not sure. 13 | $oozie_security_enabled = $cdh4::hue::defaults::oozie_security_enabled 14 | } 15 | # Otherwise disable Oozie interface for Hue. 
16 | else { 17 | $oozie_url = undef 18 | $oozie_security_enabled = undef 19 | } 20 | 21 | $smtp_host = 'localhost' 22 | $smtp_port = 25 23 | $smtp_user = undef 24 | $smtp_password = undef 25 | $smtp_from_email = undef 26 | 27 | $ssl_private_key = '/etc/ssl/private/hue.key' 28 | $ssl_certificate = '/etc/ssl/certs/hue.cert' 29 | 30 | # if httpfs is enabled, the default httpfs port 31 | # will be used, instead of the webhdfs port. 32 | $httpfs_enabled = false 33 | 34 | $ldap_url = undef 35 | $ldap_cert = undef 36 | $ldap_nt_domain = undef 37 | $ldap_bind_dn = undef 38 | $ldap_base_dn = undef 39 | $ldap_bind_password = undef 40 | $ldap_username_pattern = undef 41 | $ldap_user_filter = undef 42 | $ldap_user_name_attr = undef 43 | $ldap_group_filter = undef 44 | $ldap_group_name_attr = undef 45 | $ldap_group_member_attr = undef 46 | 47 | $hue_ini_template = 'cdh4/hue/hue.ini.erb' 48 | 49 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/oozie.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::oozie 2 | # Installs the oozie-client package 3 | # And sets OOZIE_URL in /etc/profile.d/oozie.sh. 4 | # 5 | class cdh4::oozie( 6 | $oozie_host = 'localhost' 7 | ) 8 | { 9 | # oozie server url 10 | $url = "http://$oozie_host:11000/oozie" 11 | 12 | package { 'oozie-client': 13 | ensure => 'installed', 14 | } 15 | 16 | # create a file in /etc/profile.d to export OOZIE_URL. 17 | file { '/etc/profile.d/oozie.sh': 18 | content => "# NOTE: This file is managed by Puppet. 19 | 20 | export OOZIE_URL='${url}' 21 | ", 22 | mode => '0444', 23 | } 24 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/oozie/database/mysql.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::oozie::database::mysql 2 | # Configures and sets up a MySQL database for Oozie. 3 | # 4 | # Note that this class does not support running 5 | # the Oozie database on a different host than where your 6 | # oozie server will run. Permissions will only be granted 7 | # for localhost MySQL users, so oozie server must run on this node. 8 | # 9 | # Also, root must be able to run /usr/bin/mysql with no password and have permissions 10 | # to create databases and users and grant permissions. 11 | # 12 | # You probably shouldn't be including this class directly. Instead, include 13 | # cdh4::oozie::server with database => 'mysql'. 14 | # 15 | # See: http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/4.2.1/CDH4-Installation-Guide/cdh4ig_topic_17_6.html 16 | # 17 | class cdh4::oozie::database::mysql { 18 | if (!defined(Package['libmysql-java'])) { 19 | package { 'libmysql-java': 20 | ensure => 'installed', 21 | } 22 | } 23 | 24 | # symlink mysql.jar into /var/lib/oozie 25 | file { '/var/lib/oozie/mysql.jar': 26 | ensure => 'link', 27 | target => '/usr/share/java/mysql.jar', 28 | require => Package['libmysql-java'], 29 | } 30 | 31 | $db_name = $cdh4::oozie::server::jdbc_database 32 | $db_user = $cdh4::oozie::server::jdbc_username 33 | $db_pass = $cdh4::oozie::server::jdbc_password 34 | 35 | # oozie is going to need an oozie database and user. 
36 | exec { 'oozie_mysql_create_database': 37 | command => "/usr/bin/mysql -e \" 38 | CREATE DATABASE ${db_name}; 39 | GRANT ALL PRIVILEGES ON ${db_name}.* TO '${db_user}'@'localhost' IDENTIFIED BY '${db_pass}'; 40 | GRANT ALL PRIVILEGES ON ${db_name}.* TO '${db_user}'@'127.0.0.1' IDENTIFIED BY '${db_pass}';\"", 41 | unless => "/usr/bin/mysql -BNe 'SHOW DATABASES' | /bin/grep -q ${db_name}", 42 | user => 'root', 43 | } 44 | 45 | # run ooziedb.sh to create the oozie database schema 46 | exec { 'oozie_mysql_create_schema': 47 | command => '/usr/lib/oozie/bin/ooziedb.sh create -run', 48 | require => [Exec['oozie_mysql_create_database'], File['/var/lib/oozie/mysql.jar']], 49 | unless => "/usr/bin/mysql -u${db_user} -p'${db_pass}' ${db_name} -BNe 'SHOW TABLES;' | /bin/grep -q OOZIE_SYS", 50 | user => 'oozie', 51 | } 52 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/oozie/defaults.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::oozie::defaults 2 | # 3 | class cdh4::oozie::defaults { 4 | $database = 'mysql' 5 | 6 | $jdbc_driver = 'com.mysql.jdbc.Driver' 7 | $jdbc_protocol = 'mysql' 8 | $jdbc_database = 'oozie' 9 | $jdbc_host = 'localhost' 10 | $jdbc_port = 3306 11 | $jdbc_username = 'oozie' 12 | $jdbc_password = 'oozie' 13 | 14 | $smtp_host = undef 15 | $smtp_port = 25 16 | $smtp_from_email = undef 17 | $smtp_username = undef 18 | $smtp_password = undef 19 | 20 | $authorization_service_security_enabled = true 21 | 22 | # Default puppet paths to template config files. 23 | # This allows us to use custom template config files 24 | # if we want to override more settings than this 25 | # module yet supports. 26 | $oozie_site_template = 'cdh4/oozie/oozie-site.xml.erb' 27 | $oozie_env_template = 'cdh4/oozie/oozie-env.sh.erb' 28 | } 29 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/pig.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::pig 2 | # 3 | # Installs and configures Apache Pig. 4 | # 5 | class cdh4::pig { 6 | package { 'pig': 7 | ensure => 'installed', 8 | } 9 | 10 | file { '/etc/pig/conf/pig.properties': 11 | content => template('cdh4/pig/pig.properties.erb'), 12 | require => Package['pig'], 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/sqoop.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::sqoop 2 | # Installs Sqoop 3 | class cdh4::sqoop { 4 | package { 'sqoop': 5 | ensure => 'installed', 6 | } 7 | 8 | if (!defined(Package['libmysql-java'])) { 9 | package { 'libmysql-java': 10 | ensure => 'installed', 11 | } 12 | } 13 | # symlink the mysql-connector-java.jar that is installed by 14 | # libmysql-java into /usr/lib/sqoop/lib 15 | # TODO: Can I create this symlink as mysql.jar? 
16 | file { '/usr/lib/sqoop/lib/mysql-connector-java.jar': 17 | ensure => 'link', 18 | target => '/usr/share/java/mysql-connector-java.jar', 19 | require => [Package['sqoop'], Package['libmysql-java']], 20 | } 21 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/templates/hadoop/core-site.xml.erb: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | <%= @use_yarn ? 'fs.defaultFS' : 'fs.default.name' %> 12 | hdfs://<%= @ha_enabled ? @nameservice_id : @primary_namenode_host %>/ 13 | 14 | 15 | <% if @io_file_buffer_size -%> 16 | 17 | io.file.buffer.size 18 | <%= io_file_buffer_size %> 19 | 20 | <% end -%> 21 | 22 | <% if enable_webhdfs -%> 23 | <% # NOTE: There might be a better way to 24 | # conditionally set this rather than relying 25 | # the enable_webhdfs setting. This will do for now. 26 | -%> 27 | 28 | 29 | hadoop.proxyuser.hue.hosts 30 | * 31 | 32 | 33 | hadoop.proxyuser.hue.groups 34 | * 35 | 36 | 37 | 38 | 39 | hadoop.proxyuser.oozie.hosts 40 | * 41 | 42 | 43 | hadoop.proxyuser.oozie.groups 44 | * 45 | 46 | <% end -%> 47 | 48 | <% if @net_topology_script_template -%> 49 | 50 | 51 | net.topology.script.file.name 52 | <%= @net_topology_script_path %> 53 | 54 | <% end -%> 55 | 56 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/templates/hadoop/hadoop-env.sh.erb: -------------------------------------------------------------------------------- 1 | # Note: This file is managed by Puppet. 2 | 3 | <% if use_yarn == true -%> 4 | # Use YARN for all hadoop commands 5 | export HADOOP_MAPRED_HOME=/usr/lib/hadoop-mapreduce 6 | <% else -%> 7 | export HADOOP_MAPRED_HOME=/usr/lib/hadoop-0.20-mapreduce 8 | <% end -%> 9 | 10 | <% if @namenode_jmxremote_port -%> 11 | # Enable NameNode JMX connections on port <%= namenode_jmxremote_port %> 12 | HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote.port=<%= namenode_jmxremote_port %> -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false" 13 | <% end -%> 14 | 15 | <% if @datanode_jmxremote_port -%> 16 | # Enable DateNode JMX connections on port <%= datanode_jmxremote_port %> 17 | HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote.port=<%= datanode_jmxremote_port %> -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false" 18 | <% end -%> 19 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/templates/hadoop/hadoop-metrics2.properties.erb: -------------------------------------------------------------------------------- 1 | # NOTE: This file is managed by Puppet. 
2 | 3 | # syntax: [prefix].[source|sink].[instance].[options] 4 | # See javadoc of package-info.java for org.apache.hadoop.metrics2 for details 5 | 6 | # default sampling period, in seconds 7 | *.period=10 8 | 9 | <% if @ganglia_hosts 10 | ganglia_hosts_string = ganglia_hosts.sort.join(',') 11 | -%> 12 | # 13 | # Below are for sending metrics to Ganglia 14 | # 15 | 16 | # for Ganglia 3.1 support 17 | *.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31 18 | 19 | *.sink.ganglia.period=10 20 | 21 | # default for supportsparse is false 22 | # *.sink.ganglia.supportsparse=true 23 | 24 | *.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both 25 | *.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40 26 | 27 | namenode.sink.ganglia.servers=<%= ganglia_hosts_string %> 28 | datanode.sink.ganglia.servers=<%= ganglia_hosts_string %> 29 | 30 | <% if use_yarn -%> 31 | resourcemanager.sink.ganglia.servers=<%= ganglia_hosts_string %> 32 | nodemanager.sink.ganglia.servers=<%= ganglia_hosts_string %> 33 | <% else -%> 34 | jobtracker.sink.ganglia.servers=<%= ganglia_hosts_string %> 35 | tasktracker.sink.ganglia.servers=<%= ganglia_hosts_string %> 36 | <% end -%> 37 | 38 | maptask.sink.ganglia.servers=<%= ganglia_hosts_string %> 39 | reducetask.sink.ganglia.servers=<%= ganglia_hosts_string %> 40 | 41 | secondarynamenode.sink.ganglia.servers=<%= ganglia_hosts_string %> 42 | 43 | <% end -%> -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/templates/hadoop/httpfs-site.xml.erb: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | <% if enable_webhdfs -%> 10 | <% # NOTE: There might be a better way to 11 | # conditionally set this rather than relying 12 | # the enable_webhdfs setting. This will do for now. 13 | -%> 14 | 15 | 16 | httpfs.proxyuser.hue.hosts 17 | * 18 | 19 | 20 | httpfs.proxyuser.hue.groups 21 | * 22 | 23 | 24 | 25 | 26 | httpfs.proxyuser.oozie.hosts 27 | * 28 | 29 | 30 | httpfs.proxyuser.oozie.groups 31 | * 32 | 33 | <% end -%> 34 | 35 | 36 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/templates/hive/hive-exec-log4j.properties.erb: -------------------------------------------------------------------------------- 1 | hive.log.threshold=INFO 2 | hive.root.logger=INFO,RFA 3 | hive.log.dir=/var/log/hive 4 | hive.log.file=${hive.query.id}.log 5 | 6 | # Define the root logger to the system property "hive.root.logger". 7 | log4j.rootLogger=${hive.root.logger}, EventCounter 8 | 9 | # Logging Threshold 10 | log4j.threshhold=${hive.log.threshold} 11 | 12 | # 13 | # Rolling File Appender - cap space usage at 512MB 14 | # 15 | hive.log.maxfilesize=256MB 16 | hive.log.maxbackupindex=2 17 | log4j.appender.RFA=org.apache.log4j.RollingFileAppender 18 | log4j.appender.RFA.File=${hive.log.dir}/${hive.log.file} 19 | log4j.appender.RFA.MaxFileSize=${hive.log.maxfilesize} 20 | log4j.appender.RFA.MaxBackupIndex=${hive.log.maxbackupindex} 21 | log4j.appender.RFA.layout=org.apache.log4j.PatternLayout 22 | # Pattern format: Date LogLevel LoggerName LogMessage 23 | log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n 24 | 25 | # 26 | # Event Counter Appender 27 | # Sends counts of logging messages at different severity levels to Hadoop Metrics. 
28 | # 29 | log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter 30 | 31 | log4j.category.DataNucleus=ERROR,RFA 32 | log4j.category.Datastore=ERROR,RFA 33 | log4j.category.Datastore.Schema=ERROR,RFA 34 | log4j.category.JPOX.Datastore=ERROR,RFA 35 | log4j.category.JPOX.Plugin=ERROR,RFA 36 | log4j.category.JPOX.MetaData=ERROR,RFA 37 | log4j.category.JPOX.Query=ERROR,RFA 38 | log4j.category.JPOX.General=ERROR,RFA 39 | log4j.category.JPOX.Enhancer=ERROR,RFA 40 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/templates/oozie/oozie-env.sh.erb: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Note: This file is managed by Puppet. 4 | 5 | export OOZIE_CONFIG=/etc/oozie/conf 6 | export OOZIE_DATA=/var/lib/oozie 7 | export OOZIE_LOG=/var/log/oozie 8 | export OOZIE_CATALINA_HOME=/usr/lib/bigtop-tomcat 9 | export CATALINA_TMPDIR=/var/lib/oozie 10 | export CATALINA_PID=/var/run/oozie/oozie.pid 11 | export CATALINA_BASE=<%= @catalina_base %> 12 | export CATALINA_OPTS=-Xmx1024m 13 | <% 14 | # This puppet module doesn't (yet) support HTTPS configuration. 15 | # These are the defaults that ship with CDH4. 16 | -%> 17 | export OOZIE_HTTPS_PORT=11443 18 | export OOZIE_HTTPS_KEYSTORE_PASS=password 19 | export CATALINA_OPTS="$CATALINA_OPTS -Doozie.https.port=${OOZIE_HTTPS_PORT}" 20 | export CATALINA_OPTS="$CATALINA_OPTS -Doozie.https.keystore.pass=${OOZIE_HTTPS_KEYSTORE_PASS}" 21 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/templates/pig/pig.properties.erb: -------------------------------------------------------------------------------- 1 | # Pig configuration file. All values can be overwritten by command line arguments. 2 | # see bin/pig -help 3 | 4 | # log4jconf log4j configuration file 5 | # log4jconf=./conf/log4j.properties 6 | 7 | # brief logging (no timestamps) 8 | brief=false 9 | 10 | # clustername, name of the hadoop jobtracker. If no port is defined port 50020 will be used. 11 | #cluster 12 | 13 | #debug level, INFO is default 14 | debug=INFO 15 | 16 | # a file that contains pig script 17 | #file= 18 | 19 | # load jarfile, colon separated 20 | #jar= 21 | 22 | #verbose print all log messages to screen (default to print only INFO and above to screen) 23 | verbose=false 24 | 25 | #exectype local|mapreduce, mapreduce is default 26 | #exectype=mapreduce 27 | # hod realted properties 28 | #ssh.gateway 29 | #hod.expect.root 30 | #hod.expect.uselatest 31 | #hod.command 32 | #hod.config.dir 33 | #hod.param 34 | 35 | 36 | #Do not spill temp files smaller than this size (bytes) 37 | pig.spill.size.threshold=5000000 38 | #EXPERIMENT: Activate garbage collection when spilling a file bigger than this size (bytes) 39 | #This should help reduce the number of files being spilled. 40 | pig.spill.gc.activation.size=40000000 41 | 42 | 43 | ###################### 44 | # Everything below this line is Yahoo specific. Note that I've made 45 | # (almost) no changes to the lines above to make merging in from Apache 46 | # easier. Any values I don't want from above I override below. 47 | # 48 | # This file is configured for use with HOD on the production clusters. If you 49 | # want to run pig with a static cluster you will need to remove everything 50 | # below this line and set the cluster value (above) to the 51 | # hostname and port of your job tracker. 
52 | 53 | exectype=mapreduce 54 | log.file= 55 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/Makefile: -------------------------------------------------------------------------------- 1 | MANIFESTS=$(wildcard *.pp) 2 | OBJS=$(MANIFESTS:.pp=.po) 3 | TESTS_DIR=$(dir $(CURDIR)) 4 | MODULE_DIR=$(TESTS_DIR:/=) 5 | MODULES_DIR=$(dir $(MODULE_DIR)) 6 | 7 | all: test 8 | 9 | test: $(OBJS) 10 | 11 | %.po: %.pp 12 | puppet apply --noop --modulepath $(MODULES_DIR) $< -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/datanode.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | } 7 | 8 | include cdh4::hadoop::datanode 9 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/defaults.pp: -------------------------------------------------------------------------------- 1 | # 2 | include cdh4::hadoop::defaults 3 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/hadoop.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | } 7 | 8 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/historyserver.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | } 7 | 8 | # historyserver requires namenode 9 | include cdh4::hadoop::master 10 | include cdh4::hadoop::historyserver 11 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/hive.pp: -------------------------------------------------------------------------------- 1 | $fqdn = 'hive1.domain.org' 2 | class { 'cdh4::hive': 3 | metastore_host => $fqdn, 4 | zookeeper_hosts => ['zk1.domain.org', 'zk2.domain.org'], 5 | jdbc_password => 'test', 6 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/hive_master.pp: -------------------------------------------------------------------------------- 1 | $fqdn = 'hive1.domain.org' 2 | class { '::cdh4::hadoop': 3 | namenode_hosts => ['localhost'], 4 | dfs_name_dir => '/var/lib/hadoop/name', 5 | } 6 | 7 | class { 'cdh4::hive': 8 | metastore_host => $fqdn, 9 | zookeeper_hosts => ['zk1.domain.org', 'zk2.domain.org'], 10 | jdbc_password => 'test', 11 | } 12 | class { 'cdh4::hive::master': } 13 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/hive_metastore.pp: -------------------------------------------------------------------------------- 1 | $fqdn = 'hive1.domain.org' 2 | class { 'cdh4::hive': 3 | metastore_host => $fqdn, 4 | zookeeper_hosts => ['zk1.domain.org', 'zk2.domain.org'], 5 | jdbc_password => 'test', 6 | } 7 | class { 'cdh4::hive::metastore': } 8 | 9 | -------------------------------------------------------------------------------- 
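The `tests/*.pp` manifests above are compile-only smoke tests: the `tests/Makefile` runs each one through `puppet apply --noop`, with `--modulepath` pointing at the directory that contains the `cdh4` module. A minimal sketch of exercising them by hand (the `/etc/puppet/modules` checkout path is an assumption; substitute wherever the modules are actually installed):

```bash
# Compile a single smoke test without applying any changes
# (assumes the cdh4 module is checked out under /etc/puppet/modules).
puppet apply --noop \
  --modulepath /etc/puppet/modules \
  /etc/puppet/modules/cdh4/tests/hive_metastore.pp

# Or run the whole suite via the module's Makefile.
make -C /etc/puppet/modules/cdh4/tests
```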
/initial-deployment-puppet/modules/cdh4/tests/hive_metastore_mysql.pp: -------------------------------------------------------------------------------- 1 | $fqdn = 'hive1.domain.org' 2 | class { 'cdh4::hive': 3 | metastore_host => $fqdn, 4 | zookeeper_hosts => ['zk1.domain.org', 'zk2.domain.org'], 5 | jdbc_password => 'test', 6 | } 7 | class { 'cdh4::hive::metastore::mysql': } 8 | 9 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/hive_server.pp: -------------------------------------------------------------------------------- 1 | $fqdn = 'hive1.domain.org' 2 | class { '::cdh4::hadoop': 3 | namenode_hosts => ['localhost'], 4 | dfs_name_dir => '/var/lib/hadoop/name', 5 | } 6 | 7 | class { 'cdh4::hive': 8 | metastore_host => $fqdn, 9 | zookeeper_hosts => ['zk1.domain.org', 'zk2.domain.org'], 10 | jdbc_password => 'test', 11 | } 12 | class { 'cdh4::hive::server': } 13 | 14 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/jobtracker.pp: -------------------------------------------------------------------------------- 1 | 2 | class { '::cdh4::hadoop': 3 | use_yarn => false, 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | } 7 | 8 | # jobtracker requires namenode 9 | include cdh4::hadoop::master 10 | include cdh4::hadoop::jobtracker 11 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/master.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | } 7 | 8 | include cdh4::hadoop::master 9 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/namenode.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | } 7 | 8 | include cdh4::hadoop::namenode 9 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/namenode_primary.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost', 'nonya'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | nameservice_id => 'test-cdh4', 7 | journalnode_hosts => ['localhost', 'nonya'], 8 | } 9 | 10 | include cdh4::hadoop::namenode::primary 11 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/namenode_standby.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost', 'nonya'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | nameservice_id => 'test-cdh4', 7 | journalnode_hosts => ['localhost', 'nonya'], 8 | } 9 | 10 | include cdh4::hadoop::namenode::standby 11 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/nodemanager.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => 
'/var/lib/hadoop/name', 6 | } 7 | 8 | include cdh4::hadoop::nodemanager 9 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/pig.pp: -------------------------------------------------------------------------------- 1 | include cdh4::pig -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/resourcemanager.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | } 7 | 8 | # resourcemanager requires namenode 9 | include cdh4::hadoop::master 10 | include cdh4::hadoop::resourcemanager 11 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/sqoop.pp: -------------------------------------------------------------------------------- 1 | include cdh4::sqoop -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/tasktracker.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | use_yarn => false, 5 | namenode_hosts => ['localhost'], 6 | dfs_name_dir => '/var/lib/hadoop/name', 7 | } 8 | 9 | include cdh4::hadoop::tasktracker 10 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/worker.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => 'localhost', 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | datanode_mounts => '/tmp', 7 | } 8 | 9 | include cdh4::hadoop::worker 10 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Stefan van Wouw 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/README.md: -------------------------------------------------------------------------------- 1 | # Puppet module for Spark (0.9.0) 2 | 3 | Puppet module to install Spark (0.9.0) on your Hadoop cluster. 4 | 5 | 6 | Unfortunately no Debian packages are available for Spark, and the pre-compiled Spark versions are not compatible with CDH 4.4.0. 7 | Therefore I built the Spark incubator version 0.9.0 and included the entire dist directory in the puppet module. 8 | 9 | If you want to deploy another version of Spark use the following code to compile (e.g. older Spark 0.8.0): 10 | 11 | 12 | ```bash 13 | wget https://github.com/apache/incubator-spark/archive/v0.8.0-incubating.tar.gz 14 | tar xvf v0.8.0-incubating.tar.gz 15 | cd incubator-spark-0.8.0-incubating/ 16 | ./make-distribution.sh --hadoop 2.0.0-cdh4.4.0 17 | cp conf/log4j.properties.template dist/conf/log4j.properties 18 | 19 | # Replace the standard distribution with the one you just compiled: 20 | rm -rf /etc/puppet/modules/spark/files/spark 21 | cp -r dist /etc/puppet/modules/spark/files/spark 22 | 23 | ``` 24 | 25 | *Note: Spark 0.8.0 does not compile with YARN enabled against YARN CDH4.4.0.* 26 | 27 | 28 | ### Dependencies not made explicit in the module itself: 29 | 30 | 31 | - Oracle Java 6 (7 for Spark 0.9.0+) installed on all nodes (requirement of Spark). 32 | - Apache HDFS should be installed (The CDH4 versions included in: https://github.com/wikimedia/puppet-cdh4 ). 33 | - OS should be Ubuntu/Debian for package dependencies. 34 | 35 | ### Usage: 36 | 37 | 38 | On the master node: 39 | ```puppet 40 | class {'spark::master': 41 | worker_mem => 'worker memory e.g. 60g', 42 | require => [ 43 | Class['your::class::that::ensures::java::is::installed'], 44 | Class['cdh4::hadoop'] 45 | ], 46 | } 47 | ``` 48 | 49 | On the worker nodes: 50 | ```puppet 51 | class {'spark::worker': 52 | master => $master_fqdn, 53 | memory => 'worker memory e.g. 60g', 54 | require => [ 55 | Class['your::class::that::ensures::java::is::installed'], 56 | Class['cdh4::hadoop'] 57 | ], 58 | } 59 | ``` 60 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/RELEASE: -------------------------------------------------------------------------------- 1 | Spark 1.0.0 built for Hadoop 2.0.0-cdh4.7.0 2 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/load-spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
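The spark module README above ships a Spark 0.9.0-incubating build made against CDH 4.4.0 and copies the resulting `dist/` directory into `files/spark`. A sketch of reproducing that build, mirroring the README's 0.8.0 example (the 0.9.0 archive name and the `--with-yarn` flag are assumptions based on the upstream incubator release layout; verify against the 0.9.0 `make-distribution.sh`):

```bash
# Build Spark 0.9.0-incubating with YARN support against CDH 4.4.0
# (per the README note, 0.8.0 does not compile with YARN against CDH 4.4.0).
wget https://github.com/apache/incubator-spark/archive/v0.9.0-incubating.tar.gz
tar xvf v0.9.0-incubating.tar.gz
cd incubator-spark-0.9.0-incubating/
./make-distribution.sh --hadoop 2.0.0-cdh4.4.0 --with-yarn
cp conf/log4j.properties.template dist/conf/log4j.properties

# Replace the distribution shipped in the puppet module with the fresh build.
rm -rf /etc/puppet/modules/spark/files/spark
cp -r dist /etc/puppet/modules/spark/files/spark
```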
16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # This script loads spark-env.sh if it exists, and ensures it is only loaded once. 21 | # spark-env.sh is loaded from SPARK_CONF_DIR if set, or within the current directory's 22 | # conf/ subdirectory. 23 | 24 | if [ -z "$SPARK_ENV_LOADED" ]; then 25 | export SPARK_ENV_LOADED=1 26 | 27 | # Returns the parent of the directory this script lives in. 28 | parent_dir="$(cd `dirname $0`/..; pwd)" 29 | 30 | use_conf_dir=${SPARK_CONF_DIR:-"$parent_dir/conf"} 31 | 32 | if [ -f "${use_conf_dir}/spark-env.sh" ]; then 33 | # Promote all variable declarations to environment (exported) variables 34 | set -a 35 | . "${use_conf_dir}/spark-env.sh" 36 | set +a 37 | fi 38 | fi 39 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/pyspark.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem 4 | rem Licensed to the Apache Software Foundation (ASF) under one or more 5 | rem contributor license agreements. See the NOTICE file distributed with 6 | rem this work for additional information regarding copyright ownership. 7 | rem The ASF licenses this file to You under the Apache License, Version 2.0 8 | rem (the "License"); you may not use this file except in compliance with 9 | rem the License. You may obtain a copy of the License at 10 | rem 11 | rem http://www.apache.org/licenses/LICENSE-2.0 12 | rem 13 | rem Unless required by applicable law or agreed to in writing, software 14 | rem distributed under the License is distributed on an "AS IS" BASIS, 15 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | rem See the License for the specific language governing permissions and 17 | rem limitations under the License. 18 | rem 19 | 20 | rem This is the entry point for running PySpark. To avoid polluting the 21 | rem environment, it just launches a new cmd to do the real work. 22 | 23 | cmd /V /E /C %~dp0pyspark2.cmd %* 24 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/pyspark2.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem 4 | rem Licensed to the Apache Software Foundation (ASF) under one or more 5 | rem contributor license agreements. See the NOTICE file distributed with 6 | rem this work for additional information regarding copyright ownership. 7 | rem The ASF licenses this file to You under the Apache License, Version 2.0 8 | rem (the "License"); you may not use this file except in compliance with 9 | rem the License. You may obtain a copy of the License at 10 | rem 11 | rem http://www.apache.org/licenses/LICENSE-2.0 12 | rem 13 | rem Unless required by applicable law or agreed to in writing, software 14 | rem distributed under the License is distributed on an "AS IS" BASIS, 15 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | rem See the License for the specific language governing permissions and 17 | rem limitations under the License. 
18 | rem 19 | 20 | set SCALA_VERSION=2.10 21 | 22 | rem Figure out where the Spark framework is installed 23 | set FWDIR=%~dp0..\ 24 | 25 | rem Export this as SPARK_HOME 26 | set SPARK_HOME=%FWDIR% 27 | 28 | rem Test whether the user has built Spark 29 | if exist "%FWDIR%RELEASE" goto skip_build_test 30 | set FOUND_JAR=0 31 | for %%d in ("%FWDIR%assembly\target\scala-%SCALA_VERSION%\spark-assembly*hadoop*.jar") do ( 32 | set FOUND_JAR=1 33 | ) 34 | if [%FOUND_JAR%] == [0] ( 35 | echo Failed to find Spark assembly JAR. 36 | echo You need to build Spark with sbt\sbt assembly before running this program. 37 | goto exit 38 | ) 39 | :skip_build_test 40 | 41 | rem Load environment variables from conf\spark-env.cmd, if it exists 42 | if exist "%FWDIR%conf\spark-env.cmd" call "%FWDIR%conf\spark-env.cmd" 43 | 44 | rem Figure out which Python to use. 45 | if [%PYSPARK_PYTHON%] == [] set PYSPARK_PYTHON=python 46 | 47 | set PYTHONPATH=%FWDIR%python;%PYTHONPATH% 48 | set PYTHONPATH=%FWDIR%python\lib\py4j-0.8.1-src.zip;%PYTHONPATH% 49 | 50 | set OLD_PYTHONSTARTUP=%PYTHONSTARTUP% 51 | set PYTHONSTARTUP=%FWDIR%python\pyspark\shell.py 52 | set PYSPARK_SUBMIT_ARGS=%* 53 | 54 | echo Running %PYSPARK_PYTHON% with PYTHONPATH=%PYTHONPATH% 55 | 56 | rem Check whether the argument is a file 57 | for /f %%i in ('echo %1^| findstr /R "\.py"') do ( 58 | set PYTHON_FILE=%%i 59 | ) 60 | 61 | if [%PYTHON_FILE%] == [] ( 62 | %PYSPARK_PYTHON% 63 | ) else ( 64 | echo. 65 | echo WARNING: Running python applications through ./bin/pyspark.cmd is deprecated as of Spark 1.0. 66 | echo Use ./bin/spark-submit ^ 67 | echo. 68 | "%FWDIR%\bin\spark-submit.cmd" %PYSPARK_SUBMIT_ARGS% 69 | ) 70 | 71 | :exit 72 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/run-example: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | SCALA_VERSION=2.10 21 | 22 | FWDIR="$(cd `dirname $0`/..; pwd)" 23 | export SPARK_HOME="$FWDIR" 24 | EXAMPLES_DIR="$FWDIR"/examples 25 | 26 | if [ -n "$1" ]; then 27 | EXAMPLE_CLASS="$1" 28 | shift 29 | else 30 | echo "Usage: ./bin/run-example [example-args]" 31 | echo " - set MASTER=XX to use a specific master" 32 | echo " - can use abbreviated example class name (e.g. 
SparkPi, mllib.LinearRegression)" 33 | exit 1 34 | fi 35 | 36 | if [ -f "$FWDIR/RELEASE" ]; then 37 | export SPARK_EXAMPLES_JAR=`ls "$FWDIR"/lib/spark-examples-*hadoop*.jar` 38 | elif [ -e "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/spark-examples-*hadoop*.jar ]; then 39 | export SPARK_EXAMPLES_JAR=`ls "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/spark-examples-*hadoop*.jar` 40 | fi 41 | 42 | if [[ -z $SPARK_EXAMPLES_JAR ]]; then 43 | echo "Failed to find Spark examples assembly in $FWDIR/lib or $FWDIR/examples/target" >&2 44 | echo "You need to build Spark before running this program" >&2 45 | exit 1 46 | fi 47 | 48 | EXAMPLE_MASTER=${MASTER:-"local[*]"} 49 | 50 | if [[ ! $EXAMPLE_CLASS == org.apache.spark.examples* ]]; then 51 | EXAMPLE_CLASS="org.apache.spark.examples.$EXAMPLE_CLASS" 52 | fi 53 | 54 | ./bin/spark-submit \ 55 | --master $EXAMPLE_MASTER \ 56 | --class $EXAMPLE_CLASS \ 57 | "$SPARK_EXAMPLES_JAR" \ 58 | "$@" 59 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/run-example.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem 4 | rem Licensed to the Apache Software Foundation (ASF) under one or more 5 | rem contributor license agreements. See the NOTICE file distributed with 6 | rem this work for additional information regarding copyright ownership. 7 | rem The ASF licenses this file to You under the Apache License, Version 2.0 8 | rem (the "License"); you may not use this file except in compliance with 9 | rem the License. You may obtain a copy of the License at 10 | rem 11 | rem http://www.apache.org/licenses/LICENSE-2.0 12 | rem 13 | rem Unless required by applicable law or agreed to in writing, software 14 | rem distributed under the License is distributed on an "AS IS" BASIS, 15 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | rem See the License for the specific language governing permissions and 17 | rem limitations under the License. 18 | rem 19 | 20 | rem This is the entry point for running a Spark example. To avoid polluting 21 | rem the environment, it just launches a new cmd to do the real work. 22 | 23 | cmd /V /E /C %~dp0run-example2.cmd %* 24 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/spark-class.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem 4 | rem Licensed to the Apache Software Foundation (ASF) under one or more 5 | rem contributor license agreements. See the NOTICE file distributed with 6 | rem this work for additional information regarding copyright ownership. 7 | rem The ASF licenses this file to You under the Apache License, Version 2.0 8 | rem (the "License"); you may not use this file except in compliance with 9 | rem the License. You may obtain a copy of the License at 10 | rem 11 | rem http://www.apache.org/licenses/LICENSE-2.0 12 | rem 13 | rem Unless required by applicable law or agreed to in writing, software 14 | rem distributed under the License is distributed on an "AS IS" BASIS, 15 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | rem See the License for the specific language governing permissions and 17 | rem limitations under the License. 18 | rem 19 | 20 | rem This is the entry point for running a Spark class. 
To avoid polluting 21 | rem the environment, it just launches a new cmd to do the real work. 22 | 23 | cmd /V /E /C %~dp0spark-class2.cmd %* 24 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/spark-shell.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem 4 | rem Licensed to the Apache Software Foundation (ASF) under one or more 5 | rem contributor license agreements. See the NOTICE file distributed with 6 | rem this work for additional information regarding copyright ownership. 7 | rem The ASF licenses this file to You under the Apache License, Version 2.0 8 | rem (the "License"); you may not use this file except in compliance with 9 | rem the License. You may obtain a copy of the License at 10 | rem 11 | rem http://www.apache.org/licenses/LICENSE-2.0 12 | rem 13 | rem Unless required by applicable law or agreed to in writing, software 14 | rem distributed under the License is distributed on an "AS IS" BASIS, 15 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | rem See the License for the specific language governing permissions and 17 | rem limitations under the License. 18 | rem 19 | 20 | set SPARK_HOME=%~dp0.. 21 | 22 | cmd /V /E /C %SPARK_HOME%\bin\spark-submit.cmd spark-shell %* --class org.apache.spark.repl.Main 23 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/spark-submit: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | export SPARK_HOME="$(cd `dirname $0`/..; pwd)" 21 | ORIG_ARGS=("$@") 22 | 23 | while (($#)); do 24 | if [ "$1" = "--deploy-mode" ]; then 25 | DEPLOY_MODE=$2 26 | elif [ "$1" = "--driver-memory" ]; then 27 | DRIVER_MEMORY=$2 28 | elif [ "$1" = "--driver-library-path" ]; then 29 | export SPARK_SUBMIT_LIBRARY_PATH=$2 30 | elif [ "$1" = "--driver-class-path" ]; then 31 | export SPARK_SUBMIT_CLASSPATH=$2 32 | elif [ "$1" = "--driver-java-options" ]; then 33 | export SPARK_SUBMIT_OPTS=$2 34 | fi 35 | shift 36 | done 37 | 38 | DEPLOY_MODE=${DEPLOY_MODE:-"client"} 39 | 40 | if [ -n "$DRIVER_MEMORY" ] && [ $DEPLOY_MODE == "client" ]; then 41 | export SPARK_DRIVER_MEMORY=$DRIVER_MEMORY 42 | fi 43 | 44 | exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.SparkSubmit "${ORIG_ARGS[@]}" 45 | 46 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/spark-submit.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem 4 | rem Licensed to the Apache Software Foundation (ASF) under one or more 5 | rem contributor license agreements. See the NOTICE file distributed with 6 | rem this work for additional information regarding copyright ownership. 7 | rem The ASF licenses this file to You under the Apache License, Version 2.0 8 | rem (the "License"); you may not use this file except in compliance with 9 | rem the License. You may obtain a copy of the License at 10 | rem 11 | rem http://www.apache.org/licenses/LICENSE-2.0 12 | rem 13 | rem Unless required by applicable law or agreed to in writing, software 14 | rem distributed under the License is distributed on an "AS IS" BASIS, 15 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | rem See the License for the specific language governing permissions and 17 | rem limitations under the License. 18 | rem 19 | 20 | set SPARK_HOME=%~dp0.. 
21 | set ORIG_ARGS=%* 22 | 23 | rem Clear the values of all variables used 24 | set DEPLOY_MODE= 25 | set DRIVER_MEMORY= 26 | set SPARK_SUBMIT_LIBRARY_PATH= 27 | set SPARK_SUBMIT_CLASSPATH= 28 | set SPARK_SUBMIT_OPTS= 29 | set SPARK_DRIVER_MEMORY= 30 | 31 | :loop 32 | if [%1] == [] goto continue 33 | if [%1] == [--deploy-mode] ( 34 | set DEPLOY_MODE=%2 35 | ) else if [%1] == [--driver-memory] ( 36 | set DRIVER_MEMORY=%2 37 | ) else if [%1] == [--driver-library-path] ( 38 | set SPARK_SUBMIT_LIBRARY_PATH=%2 39 | ) else if [%1] == [--driver-class-path] ( 40 | set SPARK_SUBMIT_CLASSPATH=%2 41 | ) else if [%1] == [--driver-java-options] ( 42 | set SPARK_SUBMIT_OPTS=%2 43 | ) 44 | shift 45 | goto loop 46 | :continue 47 | 48 | if [%DEPLOY_MODE%] == [] ( 49 | set DEPLOY_MODE=client 50 | ) 51 | 52 | if not [%DRIVER_MEMORY%] == [] if [%DEPLOY_MODE%] == [client] ( 53 | set SPARK_DRIVER_MEMORY=%DRIVER_MEMORY% 54 | ) 55 | 56 | cmd /V /E /C %SPARK_HOME%\bin\spark-class.cmd org.apache.spark.deploy.SparkSubmit %ORIG_ARGS% 57 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/conf/fairscheduler.xml.template: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | FAIR 5 | 1 6 | 2 7 | 8 | 9 | FIFO 10 | 2 11 | 3 12 | 13 | 14 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/conf/log4j.properties.template: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 11 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 12 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/conf/slaves: -------------------------------------------------------------------------------- 1 | # A Spark Worker will be started on each of the machines listed below. 2 | localhost -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/conf/spark-defaults.conf.template: -------------------------------------------------------------------------------- 1 | # Default system properties included when running spark-submit. 2 | # This is useful for setting default environmental settings. 3 | 4 | # Example: 5 | # spark.master spark://master:7077 6 | # spark.eventLog.enabled true 7 | # spark.eventLog.dir hdfs://namenode:8021/directory 8 | # spark.serializer org.apache.spark.serializer.KryoSerializer 9 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/ec2/README: -------------------------------------------------------------------------------- 1 | This folder contains a script, spark-ec2, for launching Spark clusters on 2 | Amazon EC2. 
Usage instructions are available online at: 3 | 4 | http://spark.apache.org/docs/latest/ec2-scripts.html 5 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # These variables are automatically filled in by the spark-ec2 script. 21 | export MASTERS="{{master_list}}" 22 | export SLAVES="{{slave_list}}" 23 | export HDFS_DATA_DIRS="{{hdfs_data_dirs}}" 24 | export MAPRED_LOCAL_DIRS="{{mapred_local_dirs}}" 25 | export SPARK_LOCAL_DIRS="{{spark_local_dirs}}" 26 | export MODULES="{{modules}}" 27 | export SPARK_VERSION="{{spark_version}}" 28 | export SHARK_VERSION="{{shark_version}}" 29 | export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}" 30 | export SWAP_MB="{{swap}}" 31 | export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}" 32 | export SPARK_MASTER_OPTS="{{spark_master_opts}}" 33 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/ec2/spark-ec2: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one 5 | # or more contributor license agreements. See the NOTICE file 6 | # distributed with this work for additional information 7 | # regarding copyright ownership. The ASF licenses this file 8 | # to you under the Apache License, Version 2.0 (the 9 | # "License"); you may not use this file except in compliance 10 | # with the License. You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 
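The `ec2/` files above are the stock upstream launcher: the `spark-ec2` wrapper simply puts the bundled boto on `PYTHONPATH` and delegates to `spark_ec2.py`. A typical invocation looks roughly like the following (the key pair, identity file, slave count, and cluster name are placeholders, and the exact flags should be checked against the ec2-scripts documentation linked in the README above):

```bash
# Launch a small cluster, log in to its master, and tear it down again.
./spark-ec2 -k my-keypair -i ~/.ssh/my-keypair.pem -s 2 launch test-cluster
./spark-ec2 -k my-keypair -i ~/.ssh/my-keypair.pem login test-cluster
./spark-ec2 destroy test-cluster
```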
19 | # 20 | 21 | cd "`dirname $0`" 22 | PYTHONPATH="./third_party/boto-2.4.1.zip/boto-2.4.1:$PYTHONPATH" python ./spark_ec2.py $@ 23 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/ec2/third_party/boto-2.4.1.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/initial-deployment-puppet/modules/spark/files/spark/ec2/third_party/boto-2.4.1.zip -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/java/org/apache/spark/examples/JavaSparkPi.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples; 19 | 20 | import org.apache.spark.SparkConf; 21 | import org.apache.spark.api.java.JavaRDD; 22 | import org.apache.spark.api.java.JavaSparkContext; 23 | import org.apache.spark.api.java.function.Function; 24 | import org.apache.spark.api.java.function.Function2; 25 | 26 | import java.util.ArrayList; 27 | import java.util.List; 28 | 29 | /** 30 | * Computes an approximation to pi 31 | * Usage: JavaSparkPi [slices] 32 | */ 33 | public final class JavaSparkPi { 34 | 35 | 36 | public static void main(String[] args) throws Exception { 37 | SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi"); 38 | JavaSparkContext jsc = new JavaSparkContext(sparkConf); 39 | 40 | int slices = (args.length == 1) ? Integer.parseInt(args[0]) : 2; 41 | int n = 100000 * slices; 42 | List l = new ArrayList(n); 43 | for (int i = 0; i < n; i++) { 44 | l.add(i); 45 | } 46 | 47 | JavaRDD dataSet = jsc.parallelize(l, slices); 48 | 49 | int count = dataSet.map(new Function() { 50 | @Override 51 | public Integer call(Integer integer) { 52 | double x = Math.random() * 2 - 1; 53 | double y = Math.random() * 2 - 1; 54 | return (x * x + y * y < 1) ? 1 : 0; 55 | } 56 | }).reduce(new Function2() { 57 | @Override 58 | public Integer call(Integer integer, Integer integer2) { 59 | return integer + integer2; 60 | } 61 | }); 62 | 63 | System.out.println("Pi is roughly " + 4.0 * count / n); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/kmeans.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. 
See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | The K-means algorithm written from scratch against PySpark. In practice, 20 | one may prefer to use the KMeans algorithm in MLlib, as shown in 21 | examples/src/main/python/mllib/kmeans.py. 22 | 23 | This example requires NumPy (http://www.numpy.org/). 24 | """ 25 | 26 | import sys 27 | 28 | import numpy as np 29 | from pyspark import SparkContext 30 | 31 | 32 | def parseVector(line): 33 | return np.array([float(x) for x in line.split(' ')]) 34 | 35 | 36 | def closestPoint(p, centers): 37 | bestIndex = 0 38 | closest = float("+inf") 39 | for i in range(len(centers)): 40 | tempDist = np.sum((p - centers[i]) ** 2) 41 | if tempDist < closest: 42 | closest = tempDist 43 | bestIndex = i 44 | return bestIndex 45 | 46 | 47 | if __name__ == "__main__": 48 | if len(sys.argv) != 4: 49 | print >> sys.stderr, "Usage: kmeans " 50 | exit(-1) 51 | sc = SparkContext(appName="PythonKMeans") 52 | lines = sc.textFile(sys.argv[1]) 53 | data = lines.map(parseVector).cache() 54 | K = int(sys.argv[2]) 55 | convergeDist = float(sys.argv[3]) 56 | 57 | kPoints = data.takeSample(False, K, 1) 58 | tempDist = 1.0 59 | 60 | while tempDist > convergeDist: 61 | closest = data.map( 62 | lambda p: (closestPoint(p, kPoints), (p, 1))) 63 | pointStats = closest.reduceByKey( 64 | lambda (x1, y1), (x2, y2): (x1 + x2, y1 + y2)) 65 | newPoints = pointStats.map( 66 | lambda (x, (y, z)): (x, y / z)).collect() 67 | 68 | tempDist = sum(np.sum((kPoints[x] - y) ** 2) for (x, y) in newPoints) 69 | 70 | for (x, y) in newPoints: 71 | kPoints[x] = y 72 | 73 | print "Final centers: " + str(kPoints) 74 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/mllib/kmeans.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | A K-means clustering program using MLlib. 
20 | 21 | This example requires NumPy (http://www.numpy.org/). 22 | """ 23 | 24 | import sys 25 | 26 | import numpy as np 27 | from pyspark import SparkContext 28 | from pyspark.mllib.clustering import KMeans 29 | 30 | 31 | def parseVector(line): 32 | return np.array([float(x) for x in line.split(' ')]) 33 | 34 | 35 | if __name__ == "__main__": 36 | if len(sys.argv) != 3: 37 | print >> sys.stderr, "Usage: kmeans " 38 | exit(-1) 39 | sc = SparkContext(appName="KMeans") 40 | lines = sc.textFile(sys.argv[1]) 41 | data = lines.map(parseVector) 42 | k = int(sys.argv[2]) 43 | model = KMeans.train(data, k) 44 | print "Final centers: " + str(model.clusterCenters) 45 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/mllib/logistic_regression.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | Logistic regression using MLlib. 20 | 21 | This example requires NumPy (http://www.numpy.org/). 22 | """ 23 | 24 | from math import exp 25 | import sys 26 | 27 | import numpy as np 28 | from pyspark import SparkContext 29 | from pyspark.mllib.regression import LabeledPoint 30 | from pyspark.mllib.classification import LogisticRegressionWithSGD 31 | 32 | 33 | # Parse a line of text into an MLlib LabeledPoint object 34 | def parsePoint(line): 35 | values = [float(s) for s in line.split(' ')] 36 | if values[0] == -1: # Convert -1 labels to 0 for MLlib 37 | values[0] = 0 38 | return LabeledPoint(values[0], values[1:]) 39 | 40 | 41 | if __name__ == "__main__": 42 | if len(sys.argv) != 3: 43 | print >> sys.stderr, "Usage: logistic_regression " 44 | exit(-1) 45 | sc = SparkContext(appName="PythonLR") 46 | points = sc.textFile(sys.argv[1]).map(parsePoint) 47 | iterations = int(sys.argv[2]) 48 | model = LogisticRegressionWithSGD.train(points, iterations) 49 | print "Final weights: " + str(model.weights) 50 | print "Final intercept: " + str(model.intercept) 51 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/pagerank.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import re 19 | import sys 20 | from operator import add 21 | 22 | from pyspark import SparkContext 23 | 24 | 25 | def computeContribs(urls, rank): 26 | """Calculates URL contributions to the rank of other URLs.""" 27 | num_urls = len(urls) 28 | for url in urls: 29 | yield (url, rank / num_urls) 30 | 31 | 32 | def parseNeighbors(urls): 33 | """Parses a urls pair string into urls pair.""" 34 | parts = re.split(r'\s+', urls) 35 | return parts[0], parts[1] 36 | 37 | 38 | if __name__ == "__main__": 39 | if len(sys.argv) != 3: 40 | print >> sys.stderr, "Usage: pagerank " 41 | exit(-1) 42 | 43 | # Initialize the spark context. 44 | sc = SparkContext(appName="PythonPageRank") 45 | 46 | # Loads in input file. It should be in format of: 47 | # URL neighbor URL 48 | # URL neighbor URL 49 | # URL neighbor URL 50 | # ... 51 | lines = sc.textFile(sys.argv[1], 1) 52 | 53 | # Loads all URLs from input file and initialize their neighbors. 54 | links = lines.map(lambda urls: parseNeighbors(urls)).distinct().groupByKey().cache() 55 | 56 | # Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one. 57 | ranks = links.map(lambda (url, neighbors): (url, 1.0)) 58 | 59 | # Calculates and updates URL ranks continuously using PageRank algorithm. 60 | for iteration in xrange(int(sys.argv[2])): 61 | # Calculates URL contributions to the rank of other URLs. 62 | contribs = links.join(ranks).flatMap( 63 | lambda (url, (urls, rank)): computeContribs(urls, rank)) 64 | 65 | # Re-calculates URL ranks based on neighbor contributions. 66 | ranks = contribs.reduceByKey(add).mapValues(lambda rank: rank * 0.85 + 0.15) 67 | 68 | # Collects all URL ranks and dump them to console. 69 | for (link, rank) in ranks.collect(): 70 | print "%s has rank: %s." % (link, rank) 71 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/pi.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | import sys 19 | from random import random 20 | from operator import add 21 | 22 | from pyspark import SparkContext 23 | 24 | 25 | if __name__ == "__main__": 26 | """ 27 | Usage: pi [slices] 28 | """ 29 | sc = SparkContext(appName="PythonPi") 30 | slices = int(sys.argv[1]) if len(sys.argv) > 1 else 2 31 | n = 100000 * slices 32 | 33 | def f(_): 34 | x = random() * 2 - 1 35 | y = random() * 2 - 1 36 | return 1 if x ** 2 + y ** 2 < 1 else 0 37 | 38 | count = sc.parallelize(xrange(1, n+1), slices).map(f).reduce(add) 39 | print "Pi is roughly %f" % (4.0 * count / n) 40 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/sort.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import sys 19 | 20 | from pyspark import SparkContext 21 | 22 | 23 | if __name__ == "__main__": 24 | if len(sys.argv) != 2: 25 | print >> sys.stderr, "Usage: sort " 26 | exit(-1) 27 | sc = SparkContext(appName="PythonSort") 28 | lines = sc.textFile(sys.argv[1], 1) 29 | sortedCount = lines.flatMap(lambda x: x.split(' ')) \ 30 | .map(lambda x: (int(x), 1)) \ 31 | .sortByKey(lambda x: x) 32 | # This is just a demo on how to bring all the sorted data back to a single node. 33 | # In reality, we wouldn't want to collect all the data to the driver node. 34 | output = sortedCount.collect() 35 | for (num, unitcount) in output: 36 | print num 37 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/transitive_closure.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | import sys 19 | from random import Random 20 | 21 | from pyspark import SparkContext 22 | 23 | numEdges = 200 24 | numVertices = 100 25 | rand = Random(42) 26 | 27 | 28 | def generateGraph(): 29 | edges = set() 30 | while len(edges) < numEdges: 31 | src = rand.randrange(0, numEdges) 32 | dst = rand.randrange(0, numEdges) 33 | if src != dst: 34 | edges.add((src, dst)) 35 | return edges 36 | 37 | 38 | if __name__ == "__main__": 39 | """ 40 | Usage: transitive_closure [slices] 41 | """ 42 | sc = SparkContext(appName="PythonTransitiveClosure") 43 | slices = int(sys.argv[1]) if len(sys.argv) > 1 else 2 44 | tc = sc.parallelize(generateGraph(), slices).cache() 45 | 46 | # Linear transitive closure: each round grows paths by one edge, 47 | # by joining the graph's edges with the already-discovered paths. 48 | # e.g. join the path (y, z) from the TC with the edge (x, y) from 49 | # the graph to obtain the path (x, z). 50 | 51 | # Because join() joins on keys, the edges are stored in reversed order. 52 | edges = tc.map(lambda (x, y): (y, x)) 53 | 54 | oldCount = 0L 55 | nextCount = tc.count() 56 | while True: 57 | oldCount = nextCount 58 | # Perform the join, obtaining an RDD of (y, (z, x)) pairs, 59 | # then project the result to obtain the new (x, z) paths. 60 | new_edges = tc.join(edges).map(lambda (_, (a, b)): (b, a)) 61 | tc = tc.union(new_edges).distinct().cache() 62 | nextCount = tc.count() 63 | if nextCount == oldCount: 64 | break 65 | 66 | print "TC has %i edges" % tc.count() 67 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/wordcount.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | import sys 19 | from operator import add 20 | 21 | from pyspark import SparkContext 22 | 23 | 24 | if __name__ == "__main__": 25 | if len(sys.argv) != 2: 26 | print >> sys.stderr, "Usage: wordcount " 27 | exit(-1) 28 | sc = SparkContext(appName="PythonWordCount") 29 | lines = sc.textFile(sys.argv[1], 1) 30 | counts = lines.flatMap(lambda x: x.split(' ')) \ 31 | .map(lambda x: (x, 1)) \ 32 | .reduceByKey(add) 33 | output = counts.collect() 34 | for (word, count) in output: 35 | print "%s: %i" % (word, count) 36 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/resources/people.txt: -------------------------------------------------------------------------------- 1 | Michael, 29 2 | Andy, 30 3 | Justin, 19 4 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | 22 | /** 23 | * Usage: BroadcastTest [slices] [numElem] [broadcastAlgo] [blockSize] 24 | */ 25 | object BroadcastTest { 26 | def main(args: Array[String]) { 27 | 28 | val bcName = if (args.length > 2) args(2) else "Http" 29 | val blockSize = if (args.length > 3) args(3) else "4096" 30 | 31 | System.setProperty("spark.broadcast.factory", "org.apache.spark.broadcast." + bcName + 32 | "BroadcastFactory") 33 | System.setProperty("spark.broadcast.blockSize", blockSize) 34 | val sparkConf = new SparkConf().setAppName("Broadcast Test") 35 | 36 | val sc = new SparkContext(sparkConf) 37 | 38 | val slices = if (args.length > 0) args(0).toInt else 2 39 | val num = if (args.length > 1) args(1).toInt else 1000000 40 | 41 | val arr1 = new Array[Int](num) 42 | for (i <- 0 until arr1.length) { 43 | arr1(i) = i 44 | } 45 | 46 | for (i <- 0 until 3) { 47 | println("Iteration " + i) 48 | println("===========") 49 | val startTime = System.nanoTime 50 | val barr1 = sc.broadcast(arr1) 51 | val observedSizes = sc.parallelize(1 to 10, slices).map(_ => barr1.value.size) 52 | // Collect the small RDD so we can print the observed sizes locally. 
53 | observedSizes.collect().foreach(i => println(i)) 54 | println("Iteration %d took %.0f milliseconds".format(i, (System.nanoTime - startTime) / 1E6)) 55 | } 56 | 57 | sc.stop() 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/DriverSubmissionTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.collection.JavaConversions._ 21 | 22 | /** Prints out environmental information, sleeps, and then exits. Made to 23 | * test driver submission in the standalone scheduler. */ 24 | object DriverSubmissionTest { 25 | def main(args: Array[String]) { 26 | if (args.size < 1) { 27 | println("Usage: DriverSubmissionTest ") 28 | System.exit(0) 29 | } 30 | val numSecondsToSleep = args(0).toInt 31 | 32 | val env = System.getenv() 33 | val properties = System.getProperties() 34 | 35 | println("Environment variables containing SPARK_TEST:") 36 | env.filter{case (k, v) => k.contains("SPARK_TEST")}.foreach(println) 37 | 38 | println("System properties containing spark.test:") 39 | properties.filter{case (k, v) => k.toString.contains("spark.test")}.foreach(println) 40 | 41 | for (i <- 1 until numSecondsToSleep) { 42 | println(s"Alive for $i out of $numSecondsToSleep seconds") 43 | Thread.sleep(1000) 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/ExceptionHandlingTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
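BroadcastTest above configures the broadcast implementation through System.setProperty before the SparkContext exists. The same spark.broadcast.factory and spark.broadcast.blockSize settings can instead be carried on the SparkConf itself, as in this sketch; the object name and local[2] master are illustrative, and the Http factory and 4096 block size are simply the example's own defaults.

import org.apache.spark.{SparkConf, SparkContext}

object BroadcastConfSketch {
  def main(args: Array[String]) {
    // Same spark.broadcast.* settings as BroadcastTest, but carried on the SparkConf
    // instead of being pushed through System.setProperty before the context is created.
    val conf = new SparkConf()
      .setAppName("Broadcast Conf Sketch")
      .setMaster("local[2]")
      .set("spark.broadcast.factory", "org.apache.spark.broadcast.HttpBroadcastFactory")
      .set("spark.broadcast.blockSize", "4096")
    val sc = new SparkContext(conf)
    val data = sc.broadcast(Array.fill(1000)(1))
    // Each task reads the broadcast value rather than shipping the array inside its closure.
    val sizes = sc.parallelize(1 to 10, 2).map(_ => data.value.length).collect()
    println(sizes.mkString(","))
    sc.stop()
  }
}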
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | 22 | object ExceptionHandlingTest { 23 | def main(args: Array[String]) { 24 | val sparkConf = new SparkConf().setAppName("ExceptionHandlingTest") 25 | val sc = new SparkContext(sparkConf) 26 | sc.parallelize(0 until sc.defaultParallelism).foreach { i => 27 | if (math.random > 0.75) { 28 | throw new Exception("Testing exception handling") 29 | } 30 | } 31 | 32 | sc.stop() 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/GroupByTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | import org.apache.spark.SparkContext._ 24 | 25 | /** 26 | * Usage: GroupByTest [numMappers] [numKVPairs] [KeySize] [numReducers] 27 | */ 28 | object GroupByTest { 29 | def main(args: Array[String]) { 30 | val sparkConf = new SparkConf().setAppName("GroupBy Test") 31 | var numMappers = if (args.length > 0) args(0).toInt else 2 32 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000 33 | var valSize = if (args.length > 2) args(2).toInt else 1000 34 | var numReducers = if (args.length > 3) args(3).toInt else numMappers 35 | 36 | val sc = new SparkContext(sparkConf) 37 | 38 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => 39 | val ranGen = new Random 40 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs) 41 | for (i <- 0 until numKVPairs) { 42 | val byteArr = new Array[Byte](valSize) 43 | ranGen.nextBytes(byteArr) 44 | arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) 45 | } 46 | arr1 47 | }.cache 48 | // Enforce that everything has been calculated and in cache 49 | pairs1.count 50 | 51 | println(pairs1.groupByKey(numReducers).count) 52 | 53 | sc.stop() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/HBaseTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
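GroupByTest above measures groupByKey, which ships every value for a key across the shuffle. When only an aggregate per key is needed, reduceByKey combines values map-side before shuffling. A minimal sketch for comparison follows; the object name, sample pairs, and local[2] master are illustrative.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

object ReduceByKeySketch {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("ReduceByKeySketch").setMaster("local[2]"))
    val pairs = sc.parallelize(Seq(("a", 1), ("b", 1), ("a", 1)), 2)
    // reduceByKey combines values map-side before the shuffle, unlike groupByKey,
    // which ships every value to the reducer that owns its key.
    val counts = pairs.reduceByKey(_ + _)
    counts.collect().foreach(println)  // (a,2) and (b,1), in either order
    sc.stop()
  }
}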
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.hadoop.hbase.client.HBaseAdmin 21 | import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor} 22 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 23 | 24 | import org.apache.spark._ 25 | import org.apache.spark.rdd.NewHadoopRDD 26 | 27 | object HBaseTest { 28 | def main(args: Array[String]) { 29 | val sparkConf = new SparkConf().setAppName("HBaseTest") 30 | val sc = new SparkContext(sparkConf) 31 | val conf = HBaseConfiguration.create() 32 | // Other options for configuring scan behavior are available. More information available at 33 | // http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html 34 | conf.set(TableInputFormat.INPUT_TABLE, args(1)) 35 | 36 | // Initialize hBase table if necessary 37 | val admin = new HBaseAdmin(conf) 38 | if(!admin.isTableAvailable(args(1))) { 39 | val tableDesc = new HTableDescriptor(args(1)) 40 | admin.createTable(tableDesc) 41 | } 42 | 43 | val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], 44 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], 45 | classOf[org.apache.hadoop.hbase.client.Result]) 46 | 47 | hBaseRDD.count() 48 | 49 | sc.stop() 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/HdfsTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
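HBaseTest above wires an InputFormat class plus its key and value classes into newAPIHadoopRDD. The same call shape works for any new-API Hadoop InputFormat; the sketch below uses TextInputFormat with a hypothetical HDFS path, and the object name and local[2] master are likewise illustrative.

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.{SparkConf, SparkContext}

object NewApiHadoopSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("NewApiHadoopSketch").setMaster("local[2]"))
    // Same shape as HBaseTest's call: an InputFormat class plus its key and value classes.
    val lines = sc.newAPIHadoopFile(
      "hdfs:///tmp/input.txt",  // hypothetical path
      classOf[TextInputFormat],
      classOf[LongWritable],
      classOf[Text])
    println("lines: " + lines.map(_._2.toString).count())
    sc.stop()
  }
}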
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark._ 21 | 22 | object HdfsTest { 23 | def main(args: Array[String]) { 24 | val sparkConf = new SparkConf().setAppName("HdfsTest") 25 | val sc = new SparkContext(sparkConf) 26 | val file = sc.textFile(args(1)) 27 | val mapped = file.map(s => s.length).cache() 28 | for (iter <- 1 to 10) { 29 | val start = System.currentTimeMillis() 30 | for (x <- mapped) { x + 2 } 31 | // println("Processing: " + x) 32 | val end = System.currentTimeMillis() 33 | println("Iteration " + iter + " took " + (end-start) + " ms") 34 | } 35 | sc.stop() 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/LocalFileLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import breeze.linalg.{Vector, DenseVector} 23 | 24 | object LocalFileLR { 25 | val D = 10 // Numer of dimensions 26 | val rand = new Random(42) 27 | 28 | case class DataPoint(x: Vector[Double], y: Double) 29 | 30 | def parsePoint(line: String): DataPoint = { 31 | val nums = line.split(' ').map(_.toDouble) 32 | DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) 33 | } 34 | 35 | def main(args: Array[String]) { 36 | val lines = scala.io.Source.fromFile(args(0)).getLines().toArray 37 | val points = lines.map(parsePoint _) 38 | val ITERATIONS = args(1).toInt 39 | 40 | // Initialize w to a random value 41 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 42 | println("Initial w: " + w) 43 | 44 | for (i <- 1 to ITERATIONS) { 45 | println("On iteration " + i) 46 | var gradient = DenseVector.zeros[Double](D) 47 | for (p <- points) { 48 | val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y 49 | gradient += p.x * scale 50 | } 51 | w -= gradient 52 | } 53 | 54 | println("Final w: " + w) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/LocalLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
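The inner loop of LocalFileLR above is plain batch gradient descent for logistic regression: each point contributes x * ((1 / (1 + exp(-y * w.dot(x)))) - 1) * y to the gradient. The sketch below works one step on a single point using the same breeze types the example already depends on; the weights, point, and object name are illustrative values, not data from the examples.

import breeze.linalg.DenseVector

object GradientStepSketch {
  def main(args: Array[String]) {
    // One gradient contribution of the logistic-regression update used in LocalFileLR,
    // computed on a single data point so the arithmetic is easy to follow.
    val w = DenseVector(0.5, -0.25)  // current weights
    val x = DenseVector(1.0, 2.0)    // features of one point
    val y = 1.0                      // label in {-1, +1}
    val scale = (1.0 / (1.0 + math.exp(-y * (w.dot(x)))) - 1.0) * y
    val gradient = x * scale
    println("gradient = " + gradient)
    println("updated w = " + (w - gradient))
  }
}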
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import breeze.linalg.{Vector, DenseVector} 23 | 24 | /** 25 | * Logistic regression based classification. 26 | */ 27 | object LocalLR { 28 | val N = 10000 // Number of data points 29 | val D = 10 // Number of dimensions 30 | val R = 0.7 // Scaling factor 31 | val ITERATIONS = 5 32 | val rand = new Random(42) 33 | 34 | case class DataPoint(x: Vector[Double], y: Double) 35 | 36 | def generateData = { 37 | def generatePoint(i: Int) = { 38 | val y = if(i % 2 == 0) -1 else 1 39 | val x = DenseVector.fill(D){rand.nextGaussian + y * R} 40 | DataPoint(x, y) 41 | } 42 | Array.tabulate(N)(generatePoint) 43 | } 44 | 45 | def main(args: Array[String]) { 46 | val data = generateData 47 | 48 | // Initialize w to a random value 49 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 50 | println("Initial w: " + w) 51 | 52 | for (i <- 1 to ITERATIONS) { 53 | println("On iteration " + i) 54 | var gradient = DenseVector.zeros[Double](D) 55 | for (p <- data) { 56 | val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y 57 | gradient += p.x * scale 58 | } 59 | w -= gradient 60 | } 61 | 62 | println("Final w: " + w) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/LocalPi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.random 21 | 22 | import org.apache.spark._ 23 | import org.apache.spark.SparkContext._ 24 | 25 | object LocalPi { 26 | def main(args: Array[String]) { 27 | var count = 0 28 | for (i <- 1 to 100000) { 29 | val x = random * 2 - 1 30 | val y = random * 2 - 1 31 | if (x*x + y*y < 1) count += 1 32 | } 33 | println("Pi is roughly " + 4 * count / 100000.0) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/MultiBroadcastTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.rdd.RDD 21 | import org.apache.spark.{SparkConf, SparkContext} 22 | 23 | /** 24 | * Usage: MultiBroadcastTest [slices] [numElem] 25 | */ 26 | object MultiBroadcastTest { 27 | def main(args: Array[String]) { 28 | 29 | val sparkConf = new SparkConf().setAppName("Multi-Broadcast Test") 30 | val sc = new SparkContext(sparkConf) 31 | 32 | val slices = if (args.length > 0) args(0).toInt else 2 33 | val num = if (args.length > 1) args(1).toInt else 1000000 34 | 35 | val arr1 = new Array[Int](num) 36 | for (i <- 0 until arr1.length) { 37 | arr1(i) = i 38 | } 39 | 40 | val arr2 = new Array[Int](num) 41 | for (i <- 0 until arr2.length) { 42 | arr2(i) = i 43 | } 44 | 45 | val barr1 = sc.broadcast(arr1) 46 | val barr2 = sc.broadcast(arr2) 47 | val observedSizes: RDD[(Int, Int)] = sc.parallelize(1 to 10, slices).map { _ => 48 | (barr1.value.size, barr2.value.size) 49 | } 50 | // Collect the small RDD so we can print the observed sizes locally. 51 | observedSizes.collect().foreach(i => println(i)) 52 | 53 | sc.stop() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/SkewedGroupByTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | import org.apache.spark.SparkContext._ 24 | 25 | /** 26 | * Usage: GroupByTest [numMappers] [numKVPairs] [KeySize] [numReducers] 27 | */ 28 | object SkewedGroupByTest { 29 | def main(args: Array[String]) { 30 | val sparkConf = new SparkConf().setAppName("GroupBy Test") 31 | var numMappers = if (args.length > 0) args(0).toInt else 2 32 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000 33 | var valSize = if (args.length > 2) args(2).toInt else 1000 34 | var numReducers = if (args.length > 3) args(3).toInt else numMappers 35 | 36 | val sc = new SparkContext(sparkConf) 37 | 38 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => 39 | val ranGen = new Random 40 | 41 | // map output sizes lineraly increase from the 1st to the last 42 | numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt 43 | 44 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs) 45 | for (i <- 0 until numKVPairs) { 46 | val byteArr = new Array[Byte](valSize) 47 | ranGen.nextBytes(byteArr) 48 | arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) 49 | } 50 | arr1 51 | }.cache() 52 | // Enforce that everything has been calculated and in cache 53 | pairs1.count() 54 | 55 | println(pairs1.groupByKey(numReducers).count()) 56 | 57 | sc.stop() 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import scala.math.exp 23 | 24 | import breeze.linalg.{Vector, DenseVector} 25 | 26 | import org.apache.spark._ 27 | 28 | /** 29 | * Logistic regression based classification. 
30 | * Usage: SparkLR [slices] 31 | */ 32 | object SparkLR { 33 | val N = 10000 // Number of data points 34 | val D = 10 // Numer of dimensions 35 | val R = 0.7 // Scaling factor 36 | val ITERATIONS = 5 37 | val rand = new Random(42) 38 | 39 | case class DataPoint(x: Vector[Double], y: Double) 40 | 41 | def generateData = { 42 | def generatePoint(i: Int) = { 43 | val y = if(i % 2 == 0) -1 else 1 44 | val x = DenseVector.fill(D){rand.nextGaussian + y * R} 45 | DataPoint(x, y) 46 | } 47 | Array.tabulate(N)(generatePoint) 48 | } 49 | 50 | def main(args: Array[String]) { 51 | val sparkConf = new SparkConf().setAppName("SparkLR") 52 | val sc = new SparkContext(sparkConf) 53 | val numSlices = if (args.length > 0) args(0).toInt else 2 54 | val points = sc.parallelize(generateData, numSlices).cache() 55 | 56 | // Initialize w to a random value 57 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 58 | println("Initial w: " + w) 59 | 60 | for (i <- 1 to ITERATIONS) { 61 | println("On iteration " + i) 62 | val gradient = points.map { p => 63 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y 64 | }.reduce(_ + _) 65 | w -= gradient 66 | } 67 | 68 | println("Final w: " + w) 69 | sc.stop() 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.SparkContext._ 21 | import org.apache.spark.{SparkConf, SparkContext} 22 | 23 | /** 24 | * Computes the PageRank of URLs from an input file. Input file should 25 | * be in format of: 26 | * URL neighbor URL 27 | * URL neighbor URL 28 | * URL neighbor URL 29 | * ... 30 | * where URL and their neighbors are separated by space(s). 
31 | */ 32 | object SparkPageRank { 33 | def main(args: Array[String]) { 34 | val sparkConf = new SparkConf().setAppName("PageRank") 35 | var iters = args(1).toInt 36 | val ctx = new SparkContext(sparkConf) 37 | val lines = ctx.textFile(args(0), 1) 38 | val links = lines.map{ s => 39 | val parts = s.split("\\s+") 40 | (parts(0), parts(1)) 41 | }.distinct().groupByKey().cache() 42 | var ranks = links.mapValues(v => 1.0) 43 | 44 | for (i <- 1 to iters) { 45 | val contribs = links.join(ranks).values.flatMap{ case (urls, rank) => 46 | val size = urls.size 47 | urls.map(url => (url, rank / size)) 48 | } 49 | ranks = contribs.reduceByKey(_ + _).mapValues(0.15 + 0.85 * _) 50 | } 51 | 52 | val output = ranks.collect() 53 | output.foreach(tup => println(tup._1 + " has rank: " + tup._2 + ".")) 54 | 55 | ctx.stop() 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.random 21 | 22 | import org.apache.spark._ 23 | 24 | /** Computes an approximation to pi */ 25 | object SparkPi { 26 | def main(args: Array[String]) { 27 | val conf = new SparkConf().setAppName("Spark Pi") 28 | val spark = new SparkContext(conf) 29 | val slices = if (args.length > 0) args(0).toInt else 2 30 | val n = 100000 * slices 31 | val count = spark.parallelize(1 to n, slices).map { i => 32 | val x = random * 2 - 1 33 | val y = random * 2 - 1 34 | if (x*x + y*y < 1) 1 else 0 35 | }.reduce(_ + _) 36 | println("Pi is roughly " + 4.0 * count / n) 37 | spark.stop() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/SparkTC.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.util.Random 21 | import scala.collection.mutable 22 | 23 | import org.apache.spark.{SparkConf, SparkContext} 24 | import org.apache.spark.SparkContext._ 25 | 26 | /** 27 | * Transitive closure on a graph. 28 | */ 29 | object SparkTC { 30 | val numEdges = 200 31 | val numVertices = 100 32 | val rand = new Random(42) 33 | 34 | def generateGraph = { 35 | val edges: mutable.Set[(Int, Int)] = mutable.Set.empty 36 | while (edges.size < numEdges) { 37 | val from = rand.nextInt(numVertices) 38 | val to = rand.nextInt(numVertices) 39 | if (from != to) edges.+=((from, to)) 40 | } 41 | edges.toSeq 42 | } 43 | 44 | def main(args: Array[String]) { 45 | val sparkConf = new SparkConf().setAppName("SparkTC") 46 | val spark = new SparkContext(sparkConf) 47 | val slices = if (args.length > 0) args(0).toInt else 2 48 | var tc = spark.parallelize(generateGraph, slices).cache() 49 | 50 | // Linear transitive closure: each round grows paths by one edge, 51 | // by joining the graph's edges with the already-discovered paths. 52 | // e.g. join the path (y, z) from the TC with the edge (x, y) from 53 | // the graph to obtain the path (x, z). 54 | 55 | // Because join() joins on keys, the edges are stored in reversed order. 56 | val edges = tc.map(x => (x._2, x._1)) 57 | 58 | // This join is iterated until a fixed point is reached. 59 | var oldCount = 0L 60 | var nextCount = tc.count() 61 | do { 62 | oldCount = nextCount 63 | // Perform the join, obtaining an RDD of (y, (z, x)) pairs, 64 | // then project the result to obtain the new (x, z) paths. 65 | tc = tc.union(tc.join(edges).map(x => (x._2._2, x._2._1))).distinct().cache() 66 | nextCount = tc.count() 67 | } while (nextCount != oldCount) 68 | 69 | println("TC has " + tc.count() + " edges.") 70 | spark.stop() 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/SparkTachyonPi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
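SparkTC above reassigns tc to union(...).distinct() on every round, so the RDD lineage grows with the number of iterations. Periodic checkpointing is one way to keep that in check; the following is a sketch under that assumption, with a hypothetical checkpoint directory, an illustrative fixed number of rounds in place of the example's fixed-point test, and an illustrative object name.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

object LineageCheckpointSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("LineageCheckpointSketch").setMaster("local[2]"))
    sc.setCheckpointDir("/tmp/spark-checkpoints")  // hypothetical directory
    var tc = sc.parallelize(Seq((1, 2), (2, 3), (3, 4))).cache()
    val edges = tc.map { case (x, y) => (y, x) }
    for (round <- 1 to 5) {
      tc = tc.union(tc.join(edges).map { case (_, (z, x)) => (x, z) }).distinct().cache()
      if (round % 2 == 0) {
        // Truncate the lineage every couple of rounds so the plan does not keep growing.
        tc.checkpoint()
      }
      tc.count()  // materialize this round (and write the checkpoint, if one was requested)
    }
    println("paths: " + tc.count())
    sc.stop()
  }
}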
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.random 21 | 22 | import org.apache.spark._ 23 | import org.apache.spark.storage.StorageLevel 24 | 25 | /** 26 | * Computes an approximation to pi 27 | * This example uses Tachyon to persist rdds during computation. 28 | */ 29 | object SparkTachyonPi { 30 | def main(args: Array[String]) { 31 | val sparkConf = new SparkConf().setAppName("SparkTachyonPi") 32 | val spark = new SparkContext(sparkConf) 33 | 34 | val slices = if (args.length > 0) args(0).toInt else 2 35 | val n = 100000 * slices 36 | 37 | val rdd = spark.parallelize(1 to n, slices) 38 | rdd.persist(StorageLevel.OFF_HEAP) 39 | val count = rdd.map { i => 40 | val x = random * 2 - 1 41 | val y = random * 2 - 1 42 | if (x * x + y * y < 1) 1 else 0 43 | }.reduce(_ + _) 44 | println("Pi is roughly " + 4.0 * count / n) 45 | 46 | spark.stop() 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/graphx/LiveJournalPageRank.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.graphx 19 | 20 | import org.apache.spark.SparkContext._ 21 | import org.apache.spark._ 22 | import org.apache.spark.graphx._ 23 | import org.apache.spark.graphx.lib.Analytics 24 | 25 | /** 26 | * Uses GraphX to run PageRank on a LiveJournal social network graph. Download the dataset from 27 | * http://snap.stanford.edu/data/soc-LiveJournal1.html. 28 | */ 29 | object LiveJournalPageRank { 30 | def main(args: Array[String]) { 31 | if (args.length < 1) { 32 | System.err.println( 33 | "Usage: LiveJournalPageRank \n" + 34 | " [--tol=]\n" + 35 | " The tolerance allowed at convergence (smaller => more accurate). Default is " + 36 | "0.001.\n" + 37 | " [--output=]\n" + 38 | " If specified, the file to write the ranks to.\n" + 39 | " [--numEPart=]\n" + 40 | " The number of partitions for the graph's edge RDD. Default is 4.\n" + 41 | " [--partStrategy=RandomVertexCut | EdgePartition1D | EdgePartition2D | " + 42 | "CanonicalRandomVertexCut]\n" + 43 | " The way edges are assigned to edge partitions. 
Default is RandomVertexCut.") 44 | System.exit(-1) 45 | } 46 | 47 | Analytics.main(args.patch(0, List("pagerank"), 0)) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.mllib 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 22 | import org.apache.spark.mllib.linalg.Vectors 23 | 24 | /** 25 | * Compute the principal components of a tall-and-skinny matrix, whose rows are observations. 26 | * 27 | * The input matrix must be stored in row-oriented dense format, one line per row with its entries 28 | * separated by space. For example, 29 | * {{{ 30 | * 0.5 1.0 31 | * 2.0 3.0 32 | * 4.0 5.0 33 | * }}} 34 | * represents a 3-by-2 matrix, whose first row is (0.5, 1.0). 35 | */ 36 | object TallSkinnyPCA { 37 | def main(args: Array[String]) { 38 | if (args.length != 1) { 39 | System.err.println("Usage: TallSkinnyPCA ") 40 | System.exit(1) 41 | } 42 | 43 | val conf = new SparkConf().setAppName("TallSkinnyPCA") 44 | val sc = new SparkContext(conf) 45 | 46 | // Load and parse the data file. 47 | val rows = sc.textFile(args(0)).map { line => 48 | val values = line.split(' ').map(_.toDouble) 49 | Vectors.dense(values) 50 | } 51 | val mat = new RowMatrix(rows) 52 | 53 | // Compute principal components. 54 | val pc = mat.computePrincipalComponents(mat.numCols().toInt) 55 | 56 | println("Principal components are:\n" + pc) 57 | 58 | sc.stop() 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.mllib 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 22 | import org.apache.spark.mllib.linalg.Vectors 23 | 24 | /** 25 | * Compute the singular value decomposition (SVD) of a tall-and-skinny matrix. 26 | * 27 | * The input matrix must be stored in row-oriented dense format, one line per row with its entries 28 | * separated by space. For example, 29 | * {{{ 30 | * 0.5 1.0 31 | * 2.0 3.0 32 | * 4.0 5.0 33 | * }}} 34 | * represents a 3-by-2 matrix, whose first row is (0.5, 1.0). 35 | */ 36 | object TallSkinnySVD { 37 | def main(args: Array[String]) { 38 | if (args.length != 1) { 39 | System.err.println("Usage: TallSkinnySVD ") 40 | System.exit(1) 41 | } 42 | 43 | val conf = new SparkConf().setAppName("TallSkinnySVD") 44 | val sc = new SparkContext(conf) 45 | 46 | // Load and parse the data file. 47 | val rows = sc.textFile(args(0)).map { line => 48 | val values = line.split(' ').map(_.toDouble) 49 | Vectors.dense(values) 50 | } 51 | val mat = new RowMatrix(rows) 52 | 53 | // Compute SVD. 54 | val svd = mat.computeSVD(mat.numCols().toInt) 55 | 56 | println("Singular values are " + svd.s) 57 | 58 | sc.stop() 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/streaming/FlumeEventCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.streaming 19 | 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.storage.StorageLevel 22 | import org.apache.spark.streaming._ 23 | import org.apache.spark.streaming.flume._ 24 | import org.apache.spark.util.IntParam 25 | 26 | /** 27 | * Produces a count of events received from Flume. 28 | * 29 | * This should be used in conjunction with an AvroSink in Flume. It will start 30 | * an Avro server on at the request host:port address and listen for requests. 31 | * Your Flume AvroSink should be pointed to this address. 
32 | * 33 | * Usage: FlumeEventCount 34 | * is the host the Flume receiver will be started on - a receiver 35 | * creates a server and listens for flume events. 36 | * is the port the Flume receiver will listen on. 37 | * 38 | * To run this example: 39 | * `$ bin/run-example org.apache.spark.examples.streaming.FlumeEventCount ` 40 | */ 41 | object FlumeEventCount { 42 | def main(args: Array[String]) { 43 | if (args.length < 2) { 44 | System.err.println( 45 | "Usage: FlumeEventCount ") 46 | System.exit(1) 47 | } 48 | 49 | StreamingExamples.setStreamingLogLevels() 50 | 51 | val Array(host, IntParam(port)) = args 52 | 53 | val batchInterval = Milliseconds(2000) 54 | 55 | // Create the context and set the batch size 56 | val sparkConf = new SparkConf().setAppName("FlumeEventCount") 57 | val ssc = new StreamingContext(sparkConf, batchInterval) 58 | 59 | // Create a flume stream 60 | val stream = FlumeUtils.createStream(ssc, host, port, StorageLevel.MEMORY_ONLY_SER_2) 61 | 62 | // Print out the count of events received from this server in each batch 63 | stream.count().map(cnt => "Received " + cnt + " flume events." ).print() 64 | 65 | ssc.start() 66 | ssc.awaitTermination() 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/streaming/HdfsWordCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.streaming 19 | 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.streaming.{Seconds, StreamingContext} 22 | import org.apache.spark.streaming.StreamingContext._ 23 | 24 | /** 25 | * Counts words in new text files created in the given directory 26 | * Usage: HdfsWordCount 27 | * is the directory that Spark Streaming will use to find and read new text files. 28 | * 29 | * To run this on your local machine on directory `localdir`, run this example 30 | * $ bin/run-example \ 31 | * org.apache.spark.examples.streaming.HdfsWordCount localdir 32 | * 33 | * Then create a text file in `localdir` and the words in the file will get counted. 
34 | */ 35 | object HdfsWordCount { 36 | def main(args: Array[String]) { 37 | if (args.length < 1) { 38 | System.err.println("Usage: HdfsWordCount ") 39 | System.exit(1) 40 | } 41 | 42 | StreamingExamples.setStreamingLogLevels() 43 | val sparkConf = new SparkConf().setAppName("HdfsWordCount") 44 | // Create the context 45 | val ssc = new StreamingContext(sparkConf, Seconds(2)) 46 | 47 | // Create the FileInputDStream on the directory and use the 48 | // stream to count words in new files created 49 | val lines = ssc.textFileStream(args(0)) 50 | val words = lines.flatMap(_.split(" ")) 51 | val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) 52 | wordCounts.print() 53 | ssc.start() 54 | ssc.awaitTermination() 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/streaming/NetworkWordCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.streaming 19 | 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.streaming.{Seconds, StreamingContext} 22 | import org.apache.spark.streaming.StreamingContext._ 23 | import org.apache.spark.storage.StorageLevel 24 | 25 | /** 26 | * Counts words in UTF8 encoded, '\n' delimited text received from the network every second. 27 | * 28 | * Usage: NetworkWordCount 29 | * and describe the TCP server that Spark Streaming would connect to receive data. 30 | * 31 | * To run this on your local machine, you need to first run a Netcat server 32 | * `$ nc -lk 9999` 33 | * and then run the example 34 | * `$ bin/run-example org.apache.spark.examples.streaming.NetworkWordCount localhost 9999` 35 | */ 36 | object NetworkWordCount { 37 | def main(args: Array[String]) { 38 | if (args.length < 2) { 39 | System.err.println("Usage: NetworkWordCount ") 40 | System.exit(1) 41 | } 42 | 43 | StreamingExamples.setStreamingLogLevels() 44 | 45 | // Create the context with a 1 second batch size 46 | val sparkConf = new SparkConf().setAppName("NetworkWordCount") 47 | val ssc = new StreamingContext(sparkConf, Seconds(1)) 48 | 49 | // Create a socket stream on target ip:port and count the 50 | // words in input stream of \n delimited text (eg. generated by 'nc') 51 | // Note that no duplication in storage level only for running locally. 52 | // Replication necessary in distributed scenario for fault tolerance. 
53 | val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER) 54 | val words = lines.flatMap(_.split(" ")) 55 | val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) 56 | wordCounts.print() 57 | ssc.start() 58 | ssc.awaitTermination() 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/streaming/QueueStream.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.streaming 19 | 20 | import scala.collection.mutable.SynchronizedQueue 21 | 22 | import org.apache.spark.SparkConf 23 | import org.apache.spark.rdd.RDD 24 | import org.apache.spark.streaming.{Seconds, StreamingContext} 25 | import org.apache.spark.streaming.StreamingContext._ 26 | 27 | object QueueStream { 28 | 29 | def main(args: Array[String]) { 30 | 31 | StreamingExamples.setStreamingLogLevels() 32 | val sparkConf = new SparkConf().setAppName("QueueStream") 33 | // Create the context 34 | val ssc = new StreamingContext(sparkConf, Seconds(1)) 35 | 36 | // Create the queue through which RDDs can be pushed to 37 | // a QueueInputDStream 38 | val rddQueue = new SynchronizedQueue[RDD[Int]]() 39 | 40 | // Create the QueueInputDStream and use it do some processing 41 | val inputStream = ssc.queueStream(rddQueue) 42 | val mappedStream = inputStream.map(x => (x % 10, 1)) 43 | val reducedStream = mappedStream.reduceByKey(_ + _) 44 | reducedStream.print() 45 | ssc.start() 46 | 47 | // Create and push some RDDs into 48 | for (i <- 1 to 30) { 49 | rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10) 50 | Thread.sleep(1000) 51 | } 52 | ssc.stop() 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/streaming/StreamingExamples.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
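NetworkWordCount and HdfsWordCount above both report counts for each batch in isolation. To keep a running total across batches, Spark Streaming's updateStateByKey can replace the per-batch reduceByKey; a sketch under that assumption follows, with a hypothetical checkpoint directory, a hypothetical localhost:9999 source as in NetworkWordCount, and an illustrative object name.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._

object StatefulWordCountSketch {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("StatefulWordCountSketch").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(1))
    ssc.checkpoint("/tmp/streaming-checkpoints")  // hypothetical directory; required by updateStateByKey

    val lines = ssc.socketTextStream("localhost", 9999)  // hypothetical host and port
    val pairs = lines.flatMap(_.split(" ")).map(word => (word, 1))

    // Keep a running total per word across batches instead of a per-batch count.
    val updateTotals = (newCounts: Seq[Int], total: Option[Int]) =>
      Some(newCounts.sum + total.getOrElse(0))
    val runningCounts = pairs.updateStateByKey[Int](updateTotals)

    runningCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}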
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.streaming 19 | 20 | import org.apache.spark.Logging 21 | 22 | import org.apache.log4j.{Level, Logger} 23 | 24 | /** Utility functions for Spark Streaming examples. */ 25 | object StreamingExamples extends Logging { 26 | 27 | /** Set reasonable logging levels for streaming if the user has not configured log4j. */ 28 | def setStreamingLogLevels() { 29 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements 30 | if (!log4jInitialized) { 31 | // We first log something to initialize Spark's default logging, then we override the 32 | // logging level. 33 | logInfo("Setting log level to [WARN] for streaming example." + 34 | " To override add a custom log4j.properties to the classpath.") 35 | Logger.getRootLogger.setLevel(Level.WARN) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/lib/spark-assembly.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/initial-deployment-puppet/modules/spark/files/spark/lib/spark-assembly.1 -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/lib/spark-assembly.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/initial-deployment-puppet/modules/spark/files/spark/lib/spark-assembly.2 -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | docs/ 3 | pyspark.egg-info 4 | build/ 5 | dist/ 6 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/epydoc.conf: -------------------------------------------------------------------------------- 1 | [epydoc] # Epydoc section marker (required by ConfigParser) 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Information about the project. 21 | name: Spark 1.0.0 Python API Docs 22 | url: http://spark.apache.org 23 | 24 | # The list of modules to document. Modules can be named using 25 | # dotted names, module filenames, or package directory names. 26 | # This option may be repeated. 27 | modules: pyspark 28 | 29 | # Write html output to the directory "apidocs" 30 | output: html 31 | target: docs/ 32 | 33 | private: no 34 | 35 | exclude: pyspark.cloudpickle pyspark.worker pyspark.join 36 | pyspark.java_gateway pyspark.examples pyspark.shell pyspark.tests 37 | pyspark.rddsampler pyspark.daemon pyspark.mllib._common 38 | pyspark.mllib.tests 39 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/lib/PY4J_LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | - Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | - Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | - The name of the author may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/lib/py4j-0.8.1-src.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/initial-deployment-puppet/modules/spark/files/spark/python/lib/py4j-0.8.1-src.zip -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/pyspark/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 
5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | PySpark is the Python API for Spark. 20 | 21 | Public classes: 22 | 23 | - L{SparkContext} 24 | Main entry point for Spark functionality. 25 | - L{RDD} 26 | A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. 27 | - L{Broadcast} 28 | A broadcast variable that gets reused across tasks. 29 | - L{Accumulator} 30 | An "add-only" shared variable that tasks can only add values to. 31 | - L{SparkConf} 32 | For configuring Spark. 33 | - L{SparkFiles} 34 | Access files shipped with jobs. 35 | - L{StorageLevel} 36 | Finer-grained cache persistence levels. 37 | 38 | Spark SQL: 39 | - L{SQLContext} 40 | Main entry point for SQL functionality. 41 | - L{SchemaRDD} 42 | A Resilient Distributed Dataset (RDD) with Schema information for the data contained. In 43 | addition to normal RDD operations, SchemaRDDs also support SQL. 44 | - L{Row} 45 | A Row of data returned by a Spark SQL query. 46 | 47 | Hive: 48 | - L{HiveContext} 49 | Main entry point for accessing data stored in Apache Hive.. 50 | """ 51 | 52 | from pyspark.conf import SparkConf 53 | from pyspark.context import SparkContext 54 | from pyspark.sql import SQLContext 55 | from pyspark.rdd import RDD 56 | from pyspark.sql import SchemaRDD 57 | from pyspark.sql import Row 58 | from pyspark.files import SparkFiles 59 | from pyspark.storagelevel import StorageLevel 60 | 61 | 62 | __all__ = ["SparkConf", "SparkContext", "SQLContext", "RDD", "SchemaRDD", "SparkFiles", "StorageLevel", "Row"] 63 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/pyspark/broadcast.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | """ 19 | >>> from pyspark.context import SparkContext 20 | >>> sc = SparkContext('local', 'test') 21 | >>> b = sc.broadcast([1, 2, 3, 4, 5]) 22 | >>> b.value 23 | [1, 2, 3, 4, 5] 24 | 25 | >>> from pyspark.broadcast import _broadcastRegistry 26 | >>> _broadcastRegistry[b.bid] = b 27 | >>> from cPickle import dumps, loads 28 | >>> loads(dumps(b)).value 29 | [1, 2, 3, 4, 5] 30 | 31 | >>> sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect() 32 | [1, 2, 3, 4, 5, 1, 2, 3, 4, 5] 33 | 34 | >>> large_broadcast = sc.broadcast(list(range(10000))) 35 | """ 36 | # Holds broadcasted data received from Java, keyed by its id. 37 | _broadcastRegistry = {} 38 | 39 | 40 | def _from_id(bid): 41 | from pyspark.broadcast import _broadcastRegistry 42 | if bid not in _broadcastRegistry: 43 | raise Exception("Broadcast variable '%s' not loaded!" % bid) 44 | return _broadcastRegistry[bid] 45 | 46 | 47 | class Broadcast(object): 48 | """ 49 | A broadcast variable created with 50 | L{SparkContext.broadcast()}. 51 | Access its value through C{.value}. 52 | """ 53 | 54 | def __init__(self, bid, value, java_broadcast=None, pickle_registry=None): 55 | """ 56 | Should not be called directly by users -- use 57 | L{SparkContext.broadcast()} 58 | instead. 59 | """ 60 | self.value = value 61 | self.bid = bid 62 | self._jbroadcast = java_broadcast 63 | self._pickle_registry = pickle_registry 64 | 65 | def __reduce__(self): 66 | self._pickle_registry.add(self) 67 | return (_from_id, (self.bid, )) 68 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/pyspark/files.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import os 19 | 20 | 21 | class SparkFiles(object): 22 | """ 23 | Resolves paths to files added through 24 | L{SparkContext.addFile()}. 25 | 26 | SparkFiles contains only classmethods; users should not create SparkFiles 27 | instances. 28 | """ 29 | 30 | _root_directory = None 31 | _is_running_on_worker = False 32 | _sc = None 33 | 34 | def __init__(self): 35 | raise NotImplementedError("Do not construct SparkFiles objects") 36 | 37 | @classmethod 38 | def get(cls, filename): 39 | """ 40 | Get the absolute path of a file added through C{SparkContext.addFile()}. 41 | """ 42 | path = os.path.join(SparkFiles.getRootDirectory(), filename) 43 | return os.path.abspath(path) 44 | 45 | @classmethod 46 | def getRootDirectory(cls): 47 | """ 48 | Get the root directory that contains files added through 49 | C{SparkContext.addFile()}. 
50 | """ 51 | if cls._is_running_on_worker: 52 | return cls._root_directory 53 | else: 54 | # This will have to change if we support multiple SparkContexts: 55 | return cls._sc._jvm.org.apache.spark.SparkFiles.getRootDirectory() 56 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/pyspark/mllib/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | Python bindings for MLlib. 20 | """ 21 | 22 | # MLlib currently needs and NumPy 1.4+, so complain if lower 23 | 24 | import numpy 25 | if numpy.version.version < '1.4': 26 | raise Exception("MLlib requires NumPy 1.4+") 27 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/pyspark/resultiterable.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | __all__ = ["ResultIterable"] 19 | 20 | import collections 21 | 22 | class ResultIterable(collections.Iterable): 23 | """ 24 | A special result iterable. This is used because the standard iterator can not be pickled 25 | """ 26 | def __init__(self, data): 27 | self.data = data 28 | self.index = 0 29 | self.maxindex = len(data) 30 | def __iter__(self): 31 | return iter(self.data) 32 | def __len__(self): 33 | return len(self.data) 34 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/pyspark/shell.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. 
See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | An interactive shell. 20 | 21 | This file is designed to be launched as a PYTHONSTARTUP script. 22 | """ 23 | 24 | import sys 25 | if sys.version_info[0] != 2: 26 | print("Error: Default Python used is Python%s" % sys.version_info.major) 27 | print("\tSet env variable PYSPARK_PYTHON to Python2 binary and re-run it.") 28 | sys.exit(1) 29 | 30 | 31 | import os 32 | import platform 33 | import pyspark 34 | from pyspark.context import SparkContext 35 | from pyspark.storagelevel import StorageLevel 36 | 37 | # this is the equivalent of ADD_JARS 38 | add_files = os.environ.get("ADD_FILES").split(',') if os.environ.get("ADD_FILES") != None else None 39 | 40 | if os.environ.get("SPARK_EXECUTOR_URI"): 41 | SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"]) 42 | 43 | sc = SparkContext(appName="PySparkShell", pyFiles=add_files) 44 | 45 | print("""Welcome to 46 | ____ __ 47 | / __/__ ___ _____/ /__ 48 | _\ \/ _ \/ _ `/ __/ '_/ 49 | /__ / .__/\_,_/_/ /_/\_\ version 1.0.0 50 | /_/ 51 | """) 52 | print("Using Python version %s (%s, %s)" % ( 53 | platform.python_version(), 54 | platform.python_build()[0], 55 | platform.python_build()[1])) 56 | print("SparkContext available as sc.") 57 | 58 | if add_files != None: 59 | print("Adding files: [%s]" % ", ".join(add_files)) 60 | 61 | # The ./bin/pyspark script stores the old PYTHONSTARTUP value in OLD_PYTHONSTARTUP, 62 | # which allows us to execute the user's PYTHONSTARTUP file: 63 | _pythonstartup = os.environ.get('OLD_PYTHONSTARTUP') 64 | if _pythonstartup and os.path.isfile(_pythonstartup): 65 | execfile(_pythonstartup) 66 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/pyspark/storagelevel.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | __all__ = ["StorageLevel"] 19 | 20 | class StorageLevel: 21 | """ 22 | Flags for controlling the storage of an RDD. Each StorageLevel records whether to use memory, 23 | whether to drop the RDD to disk if it falls out of memory, whether to keep the data in memory 24 | in a serialized format, and whether to replicate the RDD partitions on multiple nodes. 25 | Also contains static constants for some commonly used storage levels, such as MEMORY_ONLY. 26 | """ 27 | 28 | def __init__(self, useDisk, useMemory, useOffHeap, deserialized, replication = 1): 29 | self.useDisk = useDisk 30 | self.useMemory = useMemory 31 | self.useOffHeap = useOffHeap 32 | self.deserialized = deserialized 33 | self.replication = replication 34 | 35 | def __repr__(self): 36 | return "StorageLevel(%s, %s, %s, %s, %s)" % ( 37 | self.useDisk, self.useMemory, self.useOffHeap, self.deserialized, self.replication) 38 | 39 | StorageLevel.DISK_ONLY = StorageLevel(True, False, False, False) 40 | StorageLevel.DISK_ONLY_2 = StorageLevel(True, False, False, False, 2) 41 | StorageLevel.MEMORY_ONLY = StorageLevel(False, True, False, True) 42 | StorageLevel.MEMORY_ONLY_2 = StorageLevel(False, True, False, True, 2) 43 | StorageLevel.MEMORY_ONLY_SER = StorageLevel(False, True, False, False) 44 | StorageLevel.MEMORY_ONLY_SER_2 = StorageLevel(False, True, False, False, 2) 45 | StorageLevel.MEMORY_AND_DISK = StorageLevel(True, True, False, True) 46 | StorageLevel.MEMORY_AND_DISK_2 = StorageLevel(True, True, False, True, 2) 47 | StorageLevel.MEMORY_AND_DISK_SER = StorageLevel(True, True, False, False) 48 | StorageLevel.MEMORY_AND_DISK_SER_2 = StorageLevel(True, True, False, False, 2) 49 | StorageLevel.OFF_HEAP = StorageLevel(False, False, True, False, 1) -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/run-tests: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | 21 | # Figure out where the Spark framework is installed 22 | FWDIR="$(cd `dirname $0`; cd ../; pwd)" 23 | 24 | # CD into the python directory to find things on the right path 25 | cd "$FWDIR/python" 26 | 27 | FAILED=0 28 | 29 | rm -f unit-tests.log 30 | 31 | # Remove the metastore and warehouse directory created by the HiveContext tests in SparkSQL 32 | rm -rf metastore warehouse 33 | 34 | function run_test() { 35 | SPARK_TESTING=0 $FWDIR/bin/pyspark $1 2>&1 | tee -a > unit-tests.log 36 | FAILED=$((PIPESTATUS[0]||$FAILED)) 37 | 38 | # Fail and exit on the first test failure. 
39 | if [[ $FAILED != 0 ]]; then 40 | cat unit-tests.log | grep -v "^[0-9][0-9]*" # filter all lines starting with a number. 41 | echo -en "\033[31m" # Red 42 | echo "Had test failures; see logs." 43 | echo -en "\033[0m" # No color 44 | exit -1 45 | fi 46 | 47 | } 48 | 49 | run_test "pyspark/rdd.py" 50 | run_test "pyspark/context.py" 51 | run_test "pyspark/conf.py" 52 | if [ -n "$_RUN_SQL_TESTS" ]; then 53 | run_test "pyspark/sql.py" 54 | fi 55 | run_test "-m doctest pyspark/broadcast.py" 56 | run_test "-m doctest pyspark/accumulators.py" 57 | run_test "-m doctest pyspark/serializers.py" 58 | run_test "pyspark/tests.py" 59 | run_test "pyspark/mllib/_common.py" 60 | run_test "pyspark/mllib/classification.py" 61 | run_test "pyspark/mllib/clustering.py" 62 | run_test "pyspark/mllib/linalg.py" 63 | run_test "pyspark/mllib/recommendation.py" 64 | run_test "pyspark/mllib/regression.py" 65 | run_test "pyspark/mllib/tests.py" 66 | 67 | if [[ $FAILED == 0 ]]; then 68 | echo -en "\033[32m" # Green 69 | echo "Tests passed." 70 | echo -en "\033[0m" # No color 71 | fi 72 | 73 | # TODO: in the long-run, it would be nice to use a test runner like `nose`. 74 | # The doctest fixtures are the current barrier to doing this. 75 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/test_support/hello.txt: -------------------------------------------------------------------------------- 1 | Hello World! 2 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/test_support/userlib-0.1-py2.7.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/initial-deployment-puppet/modules/spark/files/spark/python/test_support/userlib-0.1-py2.7.egg -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/test_support/userlibrary.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | Used to test shipping of code depenencies with SparkContext.addPyFile(). 20 | """ 21 | 22 | class UserClass(object): 23 | def hello(self): 24 | return "Hello World!" 
25 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/slaves.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Run a shell command on all slave hosts. 21 | # 22 | # Environment Variables 23 | # 24 | # SPARK_SLAVES File naming remote hosts. 25 | # Default is ${SPARK_CONF_DIR}/slaves. 26 | # SPARK_CONF_DIR Alternate conf dir. Default is ${SPARK_HOME}/conf. 27 | # SPARK_SLAVE_SLEEP Seconds to sleep between spawning remote commands. 28 | # SPARK_SSH_OPTS Options passed to ssh when running remote commands. 29 | ## 30 | 31 | usage="Usage: slaves.sh [--config ] command..." 32 | 33 | # if no args specified, show usage 34 | if [ $# -le 0 ]; then 35 | echo $usage 36 | exit 1 37 | fi 38 | 39 | sbin=`dirname "$0"` 40 | sbin=`cd "$sbin"; pwd` 41 | 42 | . "$sbin/spark-config.sh" 43 | 44 | # If the slaves file is specified in the command line, 45 | # then it takes precedence over the definition in 46 | # spark-env.sh. Save it here. 47 | HOSTLIST=$SPARK_SLAVES 48 | 49 | # Check if --config is passed as an argument. It is an optional parameter. 50 | # Exit if the argument is not a directory. 51 | if [ "$1" == "--config" ] 52 | then 53 | shift 54 | conf_dir=$1 55 | if [ ! -d "$conf_dir" ] 56 | then 57 | echo "ERROR : $conf_dir is not a directory" 58 | echo $usage 59 | exit 1 60 | else 61 | export SPARK_CONF_DIR=$conf_dir 62 | fi 63 | shift 64 | fi 65 | 66 | . "$SPARK_PREFIX/bin/load-spark-env.sh" 67 | 68 | if [ "$HOSTLIST" = "" ]; then 69 | if [ "$SPARK_SLAVES" = "" ]; then 70 | export HOSTLIST="${SPARK_CONF_DIR}/slaves" 71 | else 72 | export HOSTLIST="${SPARK_SLAVES}" 73 | fi 74 | fi 75 | 76 | # By default disable strict host key checking 77 | if [ "$SPARK_SSH_OPTS" = "" ]; then 78 | SPARK_SSH_OPTS="-o StrictHostKeyChecking=no" 79 | fi 80 | 81 | for slave in `cat "$HOSTLIST"|sed "s/#.*$//;/^$/d"`; do 82 | ssh $SPARK_SSH_OPTS $slave $"${@// /\\ }" \ 83 | 2>&1 | sed "s/^/$slave: /" & 84 | if [ "$SPARK_SLAVE_SLEEP" != "" ]; then 85 | sleep $SPARK_SLAVE_SLEEP 86 | fi 87 | done 88 | 89 | wait 90 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/spark-config.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 
5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # included in all the spark scripts with source command 19 | # should not be executable directly 20 | # also should not be passed any arguments, since we need original $* 21 | 22 | # resolve links - $0 may be a softlink 23 | this="${BASH_SOURCE-$0}" 24 | common_bin=$(cd -P -- "$(dirname -- "$this")" && pwd -P) 25 | script="$(basename -- "$this")" 26 | this="$common_bin/$script" 27 | 28 | # convert relative path to absolute path 29 | config_bin=`dirname "$this"` 30 | script=`basename "$this"` 31 | config_bin=`cd "$config_bin"; pwd` 32 | this="$config_bin/$script" 33 | 34 | export SPARK_PREFIX=`dirname "$this"`/.. 35 | export SPARK_HOME=${SPARK_PREFIX} 36 | export SPARK_CONF_DIR="$SPARK_HOME/conf" 37 | # Add the PySpark classes to the PYTHONPATH: 38 | export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH 39 | export PYTHONPATH=$SPARK_HOME/python/lib/py4j-0.8.1-src.zip:$PYTHONPATH 40 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/spark-daemons.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Run a Spark command on all slave hosts. 21 | 22 | usage="Usage: spark-daemons.sh [--config ] [start|stop] command instance-number args..." 23 | 24 | # if no args specified, show usage 25 | if [ $# -le 1 ]; then 26 | echo $usage 27 | exit 1 28 | fi 29 | 30 | sbin=`dirname "$0"` 31 | sbin=`cd "$sbin"; pwd` 32 | 33 | . "$sbin/spark-config.sh" 34 | 35 | exec "$sbin/slaves.sh" cd "$SPARK_HOME" \; "$sbin/spark-daemon.sh" "$@" 36 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/spark-executor: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. 
See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | FWDIR="$(cd `dirname $0`/..; pwd)" 21 | 22 | export PYTHONPATH=$FWDIR/python:$PYTHONPATH 23 | export PYTHONPATH=$FWDIR/python/lib/py4j-0.8.1-src.zip:$PYTHONPATH 24 | 25 | echo "Running spark-executor with framework dir = $FWDIR" 26 | exec $FWDIR/bin/spark-class org.apache.spark.executor.MesosExecutorBackend 27 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/start-all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Start all spark daemons. 21 | # Starts the master on this node. 22 | # Starts a worker on each node specified in conf/slaves 23 | 24 | sbin=`dirname "$0"` 25 | sbin=`cd "$sbin"; pwd` 26 | 27 | TACHYON_STR="" 28 | 29 | while (( "$#" )); do 30 | case $1 in 31 | --with-tachyon) 32 | TACHYON_STR="--with-tachyon" 33 | ;; 34 | esac 35 | shift 36 | done 37 | 38 | # Load the Spark configuration 39 | . "$sbin/spark-config.sh" 40 | 41 | # Start Master 42 | "$sbin"/start-master.sh $TACHYON_STR 43 | 44 | # Start Workers 45 | "$sbin"/start-slaves.sh $TACHYON_STR 46 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/start-history-server.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Starts the history server on the machine this script is executed on. 21 | # 22 | # Usage: start-history-server.sh [] 23 | # Example: ./start-history-server.sh --dir /tmp/spark-events --port 18080 24 | # 25 | 26 | sbin=`dirname "$0"` 27 | sbin=`cd "$sbin"; pwd` 28 | 29 | if [ $# -lt 1 ]; then 30 | echo "Usage: ./start-history-server.sh " 31 | echo "Example: ./start-history-server.sh /tmp/spark-events" 32 | exit 33 | fi 34 | 35 | LOG_DIR=$1 36 | 37 | "$sbin"/spark-daemon.sh start org.apache.spark.deploy.history.HistoryServer 1 --dir "$LOG_DIR" 38 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/start-master.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Starts the master on the machine this script is executed on. 21 | 22 | sbin=`dirname "$0"` 23 | sbin=`cd "$sbin"; pwd` 24 | 25 | START_TACHYON=false 26 | 27 | while (( "$#" )); do 28 | case $1 in 29 | --with-tachyon) 30 | if [ ! -e "$sbin"/../tachyon/bin/tachyon ]; then 31 | echo "Error: --with-tachyon specified, but tachyon not found." 32 | exit -1 33 | fi 34 | START_TACHYON=true 35 | ;; 36 | esac 37 | shift 38 | done 39 | 40 | . "$sbin/spark-config.sh" 41 | 42 | . 
"$SPARK_PREFIX/bin/load-spark-env.sh" 43 | 44 | if [ "$SPARK_MASTER_PORT" = "" ]; then 45 | SPARK_MASTER_PORT=7077 46 | fi 47 | 48 | if [ "$SPARK_MASTER_IP" = "" ]; then 49 | SPARK_MASTER_IP=`hostname` 50 | fi 51 | 52 | if [ "$SPARK_MASTER_WEBUI_PORT" = "" ]; then 53 | SPARK_MASTER_WEBUI_PORT=8080 54 | fi 55 | 56 | "$sbin"/spark-daemon.sh start org.apache.spark.deploy.master.Master 1 --ip $SPARK_MASTER_IP --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT 57 | 58 | if [ "$START_TACHYON" == "true" ]; then 59 | "$sbin"/../tachyon/bin/tachyon bootstrap-conf $SPARK_MASTER_IP 60 | "$sbin"/../tachyon/bin/tachyon format -s 61 | "$sbin"/../tachyon/bin/tachyon-start.sh master 62 | fi 63 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/start-slave.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Usage: start-slave.sh 21 | # where is like "spark://localhost:7077" 22 | 23 | sbin=`dirname "$0"` 24 | sbin=`cd "$sbin"; pwd` 25 | 26 | "$sbin"/spark-daemon.sh start org.apache.spark.deploy.worker.Worker "$@" 27 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/start-slaves.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | sbin=`dirname "$0"` 21 | sbin=`cd "$sbin"; pwd` 22 | 23 | 24 | START_TACHYON=false 25 | 26 | while (( "$#" )); do 27 | case $1 in 28 | --with-tachyon) 29 | if [ ! -e "$sbin"/../tachyon/bin/tachyon ]; then 30 | echo "Error: --with-tachyon specified, but tachyon not found." 31 | exit -1 32 | fi 33 | START_TACHYON=true 34 | ;; 35 | esac 36 | shift 37 | done 38 | 39 | . 
"$sbin/spark-config.sh" 40 | 41 | . "$SPARK_PREFIX/bin/load-spark-env.sh" 42 | 43 | # Find the port number for the master 44 | if [ "$SPARK_MASTER_PORT" = "" ]; then 45 | SPARK_MASTER_PORT=7077 46 | fi 47 | 48 | if [ "$SPARK_MASTER_IP" = "" ]; then 49 | SPARK_MASTER_IP=`hostname` 50 | fi 51 | 52 | if [ "$START_TACHYON" == "true" ]; then 53 | "$sbin/slaves.sh" cd "$SPARK_HOME" \; "$sbin"/../tachyon/bin/tachyon bootstrap-conf $SPARK_MASTER_IP 54 | 55 | # set -t so we can call sudo 56 | SPARK_SSH_OPTS="-o StrictHostKeyChecking=no -t" "$sbin/slaves.sh" cd "$SPARK_HOME" \; "$sbin/../tachyon/bin/tachyon-start.sh" worker SudoMount \; sleep 1 57 | fi 58 | 59 | # Launch the slaves 60 | if [ "$SPARK_WORKER_INSTANCES" = "" ]; then 61 | exec "$sbin/slaves.sh" cd "$SPARK_HOME" \; "$sbin/start-slave.sh" 1 spark://$SPARK_MASTER_IP:$SPARK_MASTER_PORT 62 | else 63 | if [ "$SPARK_WORKER_WEBUI_PORT" = "" ]; then 64 | SPARK_WORKER_WEBUI_PORT=8081 65 | fi 66 | for ((i=0; i<$SPARK_WORKER_INSTANCES; i++)); do 67 | "$sbin/slaves.sh" cd "$SPARK_HOME" \; "$sbin/start-slave.sh" $(( $i + 1 )) spark://$SPARK_MASTER_IP:$SPARK_MASTER_PORT --webui-port $(( $SPARK_WORKER_WEBUI_PORT + $i )) 68 | done 69 | fi 70 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/stop-all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Start all spark daemons. 21 | # Run this on the master nde 22 | 23 | 24 | sbin=`dirname "$0"` 25 | sbin=`cd "$sbin"; pwd` 26 | 27 | # Load the Spark configuration 28 | . "$sbin/spark-config.sh" 29 | 30 | # Stop the slaves, then the master 31 | "$sbin"/stop-slaves.sh 32 | "$sbin"/stop-master.sh 33 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/stop-history-server.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Stops the history server on the machine this script is executed on. 21 | 22 | sbin=`dirname "$0"` 23 | sbin=`cd "$sbin"; pwd` 24 | 25 | "$sbin"/spark-daemon.sh stop org.apache.spark.deploy.history.HistoryServer 1 26 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/stop-master.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Stops the master on the machine this script is executed on. 21 | 22 | sbin=`dirname "$0"` 23 | sbin=`cd "$sbin"; pwd` 24 | 25 | . "$sbin/spark-config.sh" 26 | 27 | "$sbin"/spark-daemon.sh stop org.apache.spark.deploy.master.Master 1 28 | 29 | if [ -e "$sbin"/../tachyon/bin/tachyon ]; then 30 | "$sbin"/../tachyon/bin/tachyon killAll tachyon.master.Master 31 | fi 32 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/stop-slaves.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | sbin=`dirname "$0"` 21 | sbin=`cd "$sbin"; pwd` 22 | 23 | . "$sbin/spark-config.sh" 24 | 25 | .
"$SPARK_PREFIX/bin/load-spark-env.sh" 26 | 27 | # do before the below calls as they exec 28 | if [ -e "$sbin"/../tachyon/bin/tachyon ]; then 29 | "$sbin/slaves.sh" cd "$SPARK_HOME" \; "$sbin"/../tachyon/bin/tachyon killAll tachyon.worker.Worker 30 | fi 31 | 32 | if [ "$SPARK_WORKER_INSTANCES" = "" ]; then 33 | "$sbin"/spark-daemons.sh stop org.apache.spark.deploy.worker.Worker 1 34 | else 35 | for ((i=0; i<$SPARK_WORKER_INSTANCES; i++)); do 36 | "$sbin"/spark-daemons.sh stop org.apache.spark.deploy.worker.Worker $(( $i + 1 )) 37 | done 38 | fi 39 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/manifests/defaults.pp: -------------------------------------------------------------------------------- 1 | class spark::defaults { 2 | $install_dir = '/usr/lib/spark' 3 | $master_port = 7077 4 | $web_port = 8080 5 | $cores = undef 6 | $memory = undef 7 | $scratch_dir = "${install_dir}/work" 8 | } 9 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/manifests/master.pp: -------------------------------------------------------------------------------- 1 | class spark::master ( 2 | $spark_service_status = 'running', 3 | $master_port = $::spark::defaults::master_port, 4 | $web_port = $::spark::defaults::web_port, 5 | $install_dir = $::spark::defaults::install_dir, 6 | $worker_mem, 7 | ) inherits spark::defaults { 8 | 9 | class {'spark': 10 | master => $::fqdn, 11 | install_dir => $install_dir, 12 | worker_mem => $worker_mem, 13 | } 14 | Class['spark'] -> Class['spark::master'] 15 | 16 | # The Upstart service file. 17 | file {'/etc/init/spark-master.conf': 18 | content => template('spark/spark-master.conf.erb'), 19 | mode => '0644', 20 | owner => 'root', 21 | group => 'root', 22 | notify => Service['spark-master'], 23 | } 24 | 25 | file { "${install_dir}/bin/spark-master-runner.sh": 26 | content => template('spark/spark-master-runner.sh.erb'), 27 | owner => 'root', 28 | group => 'root', 29 | mode => '0744', 30 | } 31 | 32 | # The service that runs the master server. 33 | service {'spark-master': 34 | ensure => $spark_service_status, 35 | require => [File['/etc/init/spark-master.conf'], File["${install_dir}/bin/spark-master-runner.sh"]], 36 | hasrestart => true, 37 | hasstatus => true, 38 | restart => '/sbin/initctl restart spark-master', 39 | start => '/sbin/initctl start spark-master', 40 | stop => '/sbin/initctl stop spark-master', 41 | status => '/sbin/initctl status spark-master | grep "/running" 1>/dev/null 2>&1', 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/manifests/spark.pp: -------------------------------------------------------------------------------- 1 | class spark ( 2 | $master, 3 | $worker_mem, 4 | $install_dir 5 | ) { 6 | require spark::user 7 | 8 | 9 | # Better would be if they had a package repository available, but they do not at this moment. 10 | # (Nor do I, so this is the cleanest way without package managers). 
11 | file {$install_dir: 12 | ensure => directory, 13 | source => 'puppet:///modules/spark/spark', 14 | mode => '0744', 15 | recurse => true, 16 | owner => 'root', 17 | group => 'root', 18 | require => User['spark'], 19 | } 20 | 21 | 22 | file {"${install_dir}/conf/spark-env.sh": 23 | content => template('spark/spark-env.sh.erb'), 24 | mode => '0744', 25 | owner => 'root', 26 | group => 'root', 27 | require => File[$install_dir], 28 | } 29 | 30 | #file {"${install_dir}/conf/metrics.properties": 31 | # content => template('spark/metrics.properties.erb'), 32 | # mode => '0744', 33 | # owner => 'root', 34 | # group => 'root', 35 | # require => File[$install_dir], 36 | #} 37 | 38 | 39 | # Create the log directory. 40 | file {'/var/log/spark': 41 | ensure => directory, 42 | owner => 'root', 43 | group => 'root', 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/manifests/user.pp: -------------------------------------------------------------------------------- 1 | class spark::user { 2 | 3 | group {'spark': 4 | ensure => present, 5 | } 6 | 7 | user {'spark': 8 | ensure => present, 9 | shell => '/bin/bash', 10 | gid => 'spark', 11 | require => Group['spark'], 12 | } 13 | 14 | } 15 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/manifests/worker.pp: -------------------------------------------------------------------------------- 1 | class spark::worker ( 2 | $master, 3 | $spark_service_status = 'running', 4 | $master_port = $::spark::defaults::master_port, 5 | $web_port = $::spark::defaults::web_port, 6 | $install_dir = $::spark::defaults::install_dir, 7 | $cores = $::spark::defaults::cores, 8 | $memory = $::spark::defaults::memory, 9 | $scratch_dir = $::spark::defaults::scratch_dir, 10 | ) inherits spark::defaults { 11 | 12 | class {'spark': 13 | master => $master, 14 | install_dir => $install_dir, 15 | worker_mem => $memory, 16 | 17 | } 18 | Class['spark'] -> Class['spark::worker'] 19 | 20 | # The Upstart service file. 21 | file {'/etc/init/spark-worker.conf': 22 | content => template('spark/spark-worker.conf.erb'), 23 | mode => '0644', 24 | owner => 'root', 25 | group => 'root', 26 | notify => Service['spark-worker'], 27 | } 28 | 29 | file { "${install_dir}/bin/spark-worker-runner.sh": 30 | content => template('spark/spark-worker-runner.sh.erb'), 31 | owner => 'root', 32 | group => 'root', 33 | mode => '0744', 34 | } 35 | 36 | # The service that runs the worker daemon.
37 | service {'spark-worker': 38 | ensure => $spark_service_status, 39 | #provider => 'upstart', 40 | require => [File['/etc/init/spark-worker.conf'], File["${install_dir}/bin/spark-worker-runner.sh"]], 41 | hasrestart => true, 42 | hasstatus => true, 43 | restart => '/sbin/initctl restart spark-worker', 44 | start => '/sbin/initctl start spark-worker', 45 | stop => '/sbin/initctl stop spark-worker', 46 | status => '/sbin/initctl status spark-worker | grep "/running" 1>/dev/null 2>&1' 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/templates/spark-env.sh.erb: -------------------------------------------------------------------------------- 1 | #export SCALA_HOME=/opt/scala-2.9.3 2 | 3 | <%# SPARK_MASTER_OPTS="-Dspark.deploy.spreadOut=false" %> 4 | SPARK_JAVA_OPTS+=" -Dspark.local.dir=/raid/spark-local" 5 | <%# SPARK_JAVA_OPTS+=" -Dspark.speculation=true" %> 6 | #SPARK_JAVA_OPTS+="-XX:MaxPermSize=512m" 7 | #SPARK_JAVA_OPTS+="-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=9012 -Dcom.sun.management.jmxremote.local.only=true -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" 8 | #export SPARK_JAVA_OPTS 9 | 10 | export SPARK_MEM=<%= @worker_mem %> 11 | export SPARK_DAEMON_MEMORY=1g 12 | export SPARK_LIBRARY_PATH="/usr/lib/hadoop/lib/native:$SPARK_LIBRARY_PATH" 13 | export SPARK_CLASSPATH="/usr/lib/tachyon/target/tachyon-0.4.1-jar-with-dependencies.jar:$SPARK_CLASSPATH" 14 | 15 | # Bind Spark's web UIs to this machine's public EC2 hostname: 16 | #export SPARK_PUBLIC_DNS=`wget -q -O - http://instance-data.ec2.internal/latest/meta-data/public-hostname` 17 | 18 | # Set a high ulimit for large shuffles 19 | ulimit -n 1000000 20 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/templates/spark-master-runner.sh.erb: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function borrowed from the original Spark source. 4 | spark_rotate_log () 5 | { 6 | log=/var/log/spark/master.log; 7 | num=5; 8 | if [ -n "$2" ]; then 9 | num=$2 10 | fi 11 | if [ -f "$log" ]; then # rotate logs 12 | while [ $num -gt 1 ]; do 13 | prev=`expr $num - 1` 14 | [ -f "$log.$prev" ] && mv "$log.$prev" "$log.$num" 15 | num=$prev 16 | done 17 | mv "$log" "$log.$num"; 18 | fi 19 | } 20 | 21 | spark_rotate_log 22 | <%= @install_dir %>/bin/spark-class org.apache.spark.deploy.master.Master --ip <%= @fqdn %> --webui-port <%= @web_port %> --port <%= @master_port %> >> /var/log/spark/master.log 2>&1 < /dev/null 23 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/templates/spark-master.conf.erb: -------------------------------------------------------------------------------- 1 | description "Spark Master Service script" 2 | start on runlevel [2345] 3 | stop on runlevel [06] 4 | #setuid root 5 | #setgid root 6 | #console log 7 | 8 | chdir <%= @install_dir %> 9 | exec <%= @install_dir %>/bin/spark-master-runner.sh 10 | 11 | # Try to respawn with a maximum of 10 times in a 90 second window. 
12 | respawn 13 | respawn limit 10 90 14 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/templates/spark-worker-runner.sh.erb: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function borrowed from the original Spark source. 4 | spark_rotate_log () 5 | { 6 | log=/var/log/spark/worker.log; 7 | num=5; 8 | if [ -n "$2" ]; then 9 | num=$2 10 | fi 11 | if [ -f "$log" ]; then # rotate logs 12 | while [ $num -gt 1 ]; do 13 | prev=`expr $num - 1` 14 | [ -f "$log.$prev" ] && mv "$log.$prev" "$log.$num" 15 | num=$prev 16 | done 17 | mv "$log" "$log.$num"; 18 | fi 19 | } 20 | 21 | spark_rotate_log 22 | 23 | export SPARK_LIBRARY_PATH="/usr/lib/hadoop/lib/native:$SPARK_LIBRARY_PATH" 24 | 25 | <%= @install_dir %>/bin/spark-class org.apache.spark.deploy.worker.Worker spark://<%= @master %>:<%= @master_port %> --work-dir <%= @scratch_dir %><% if @cores -%> --cores <%= @cores %><% end -%><% if @memory -%> --memory <%= @memory %><% end -%> >> /var/log/spark/worker.log 2>&1 < /dev/null 26 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/templates/spark-worker.conf.erb: -------------------------------------------------------------------------------- 1 | description "Spark Worker Service script" 2 | start on runlevel [2345] 3 | stop on runlevel [06] 4 | #setuid root 5 | #setgid root 6 | #console log 7 | 8 | chdir <%= @install_dir %> 9 | exec <%= @install_dir %>/bin/spark-worker-runner.sh 10 | 11 | # Try to respawn with a maximum of 10 times in a 90 second window. 12 | respawn 13 | respawn limit 10 90 14 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYaml==3.11 2 | -------------------------------------------------------------------------------- /sample-application/.gitignore: -------------------------------------------------------------------------------- 1 | stderr.txt 2 | stdout.txt 3 | target 4 | project/{project,target} 5 | *.pickle 6 | *.log 7 | -------------------------------------------------------------------------------- /sample-application/build.sbt: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////// 2 | // 3 | // Copyright (c) 2014 Adobe Systems Incorporated. All rights reserved. 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | // 17 | /////////////////////////////////////////////////////////////////////////// 18 | 19 | import AssemblyKeys._ 20 | 21 | assemblySettings 22 | 23 | jarName in assembly := "ExampleApp.jar" 24 | 25 | name := "Example App" 26 | 27 | version := "1.0" 28 | 29 | scalaVersion := "2.10.3" 30 | 31 | // Load "provided" libraries with `sbt run`. 
run in Compile <<= Defaults.runTask(
  fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)
)

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "1.0.0" % "provided",
  "org.slf4j" % "slf4j-simple" % "1.7.7" // Logging.
)

resolvers += "Akka Repository" at "http://repo.akka.io/releases/"
--------------------------------------------------------------------------------
/sample-application/config.yaml.tmpl:
--------------------------------------------------------------------------------
###########################################################################
##
## Copyright (c) 2014 Adobe Systems Incorporated. All rights reserved.
##
## Licensed under the Apache License, Version 2.0 (the "License");
## you may not use this file except in compliance with the License.
## You may obtain a copy of the License at
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
###########################################################################

jar: ExampleApp.jar
local_jar_dir: target/scala-2.10/
remote_jar_dir: /tmp/
main_class: com.adobe.ExampleApp
remote_spark_dir: /usr/lib/spark
spark_master: spark://server_hostname:7077
spark_work: /raid/spark-work
--------------------------------------------------------------------------------
/sample-application/src/main/scala/ExampleApp.scala:
--------------------------------------------------------------------------------
///////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2014 Adobe Systems Incorporated. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////////

package com.adobe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

import java.io.{File,PrintWriter}

object ExampleApp {
  def main(args: Array[String]) {
    // Cluster settings are hard-coded to mirror config.yaml.tmpl: the assembled
    // jar is staged under /tmp/ and Spark is installed in /usr/lib/spark.
    val conf = new SparkConf()
      .setAppName("ExampleApp")
      .setMaster("spark://spark_master_hostname:7077")
      .setSparkHome("/usr/lib/spark")
      .setJars(Seq("/tmp/ExampleApp.jar"))
      .set("spark.executor.memory", "10g")
      .set("spark.cores.max", "4")
    val sc = new SparkContext(conf)
    // Toy workload: square a handful of numbers and print both RDDs.
    val nums = sc.parallelize(Seq(1, 2, 4, 8))
    val squares = nums.map { case num => num * num }
    println("Nums: " + nums.collect().mkString(", "))
    println("Squares: " + squares.collect().mkString(", "))
    sc.stop()
  }
}
--------------------------------------------------------------------------------
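
For orientation, a hedged sketch of how the sample application above might be built and launched by hand, based only on what build.sbt, config.yaml.tmpl, and ExampleApp.scala declare. The hostname placeholders are the templates' own, and the application-deployment fabfile presumably automates the same flow; this is not a verified procedure from the repository.

    # Manual sketch only -- paths come from the sample files above.
    cd sample-application

    # Build the assembly jar named in build.sbt; with Scala 2.10.3 it lands at
    # target/scala-2.10/ExampleApp.jar (local_jar_dir + jar in config.yaml.tmpl).
    sbt assembly

    # ExampleApp's setJars(Seq("/tmp/ExampleApp.jar")) expects the jar at /tmp/
    # on the machine that runs the driver, so stage it there first.
    cp target/scala-2.10/ExampleApp.jar /tmp/ExampleApp.jar

    # Launch the driver. spark-core is marked "provided", and the runTask
    # override in build.sbt puts provided dependencies back on the classpath
    # for `sbt run`. The standalone master URL is hard-coded in ExampleApp.scala.
    sbt run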