├── .gitignore ├── AUTHORS ├── LICENSE ├── README.md ├── application-deployment-fabfile.py ├── config.yaml.tmpl ├── env.sh ├── images ├── application-deployment-1.png ├── initial-deployment-1.png ├── initial-deployment-2.png └── initial-deployment-3.png ├── initial-deployment-fabfile.py ├── initial-deployment-puppet ├── manifests │ ├── adobe_hadoop.pp.tmpl │ ├── hdfs-master.pp │ ├── hdfs-worker.pp │ ├── spark-master.pp │ └── spark-worker.pp.tmpl └── modules │ ├── cdh4 │ ├── .gitreview │ ├── LICENSE │ ├── README.md │ ├── TODO.md │ ├── files │ │ └── hue │ │ │ └── hue.init.d.sh │ ├── manifests │ │ ├── hadoop.pp │ │ ├── hadoop │ │ │ ├── datanode.pp │ │ │ ├── defaults.pp │ │ │ ├── directory.pp │ │ │ ├── historyserver.pp │ │ │ ├── jmxtrans │ │ │ │ ├── README.md │ │ │ │ ├── datanode.pp │ │ │ │ ├── master.pp │ │ │ │ ├── namenode.pp │ │ │ │ ├── nodemanager.pp │ │ │ │ ├── resourcemanager.pp │ │ │ │ └── worker.pp │ │ │ ├── jobtracker.pp │ │ │ ├── journalnode.pp │ │ │ ├── master.pp │ │ │ ├── namenode.pp │ │ │ ├── namenode │ │ │ │ ├── primary.pp │ │ │ │ └── standby.pp │ │ │ ├── nodemanager.pp │ │ │ ├── resourcemanager.pp │ │ │ ├── tasktracker.pp │ │ │ ├── worker.pp │ │ │ └── worker │ │ │ │ └── paths.pp │ │ ├── hcatalog.pp │ │ ├── hive.pp │ │ ├── hive │ │ │ ├── defaults.pp │ │ │ ├── master.pp │ │ │ ├── metastore.pp │ │ │ ├── metastore │ │ │ │ └── mysql.pp │ │ │ └── server.pp │ │ ├── hue.pp │ │ ├── hue │ │ │ └── defaults.pp │ │ ├── oozie.pp │ │ ├── oozie │ │ │ ├── database │ │ │ │ └── mysql.pp │ │ │ ├── defaults.pp │ │ │ └── server.pp │ │ ├── pig.pp │ │ └── sqoop.pp │ ├── templates │ │ ├── hadoop │ │ │ ├── core-site.xml.erb │ │ │ ├── hadoop-env.sh.erb │ │ │ ├── hadoop-metrics2.properties.erb │ │ │ ├── hdfs-site.xml.erb │ │ │ ├── httpfs-site.xml.erb │ │ │ ├── log4j.properties.erb │ │ │ ├── mapred-site.xml.erb │ │ │ ├── yarn-env.sh.erb │ │ │ └── yarn-site.xml.erb │ │ ├── hive │ │ │ ├── hive-exec-log4j.properties.erb │ │ │ └── hive-site.xml.erb │ │ ├── hue │ │ │ └── hue.ini.erb │ │ ├── oozie │ │ │ ├── oozie-env.sh.erb │ │ │ └── oozie-site.xml.erb │ │ └── pig │ │ │ └── pig.properties.erb │ └── tests │ │ ├── Makefile │ │ ├── datanode.pp │ │ ├── defaults.pp │ │ ├── hadoop.pp │ │ ├── historyserver.pp │ │ ├── hive.pp │ │ ├── hive_master.pp │ │ ├── hive_metastore.pp │ │ ├── hive_metastore_mysql.pp │ │ ├── hive_server.pp │ │ ├── jobtracker.pp │ │ ├── master.pp │ │ ├── namenode.pp │ │ ├── namenode_primary.pp │ │ ├── namenode_standby.pp │ │ ├── nodemanager.pp │ │ ├── pig.pp │ │ ├── resourcemanager.pp │ │ ├── sqoop.pp │ │ ├── tasktracker.pp │ │ └── worker.pp │ └── spark │ ├── LICENSE │ ├── README.md │ ├── files │ └── spark │ │ ├── CHANGES.txt │ │ ├── LICENSE │ │ ├── NOTICE │ │ ├── README.md │ │ ├── RELEASE │ │ ├── bin │ │ ├── compute-classpath.cmd │ │ ├── compute-classpath.sh │ │ ├── load-spark-env.sh │ │ ├── pyspark │ │ ├── pyspark.cmd │ │ ├── pyspark2.cmd │ │ ├── run-example │ │ ├── run-example.cmd │ │ ├── run-example2.cmd │ │ ├── spark-class │ │ ├── spark-class.cmd │ │ ├── spark-class2.cmd │ │ ├── spark-shell │ │ ├── spark-shell.cmd │ │ ├── spark-submit │ │ └── spark-submit.cmd │ │ ├── conf │ │ ├── fairscheduler.xml.template │ │ ├── log4j.properties.template │ │ ├── metrics.properties.template │ │ ├── slaves │ │ ├── spark-defaults.conf.template │ │ └── spark-env.sh.template │ │ ├── ec2 │ │ ├── README │ │ ├── deploy.generic │ │ │ └── root │ │ │ │ └── spark-ec2 │ │ │ │ └── ec2-variables.sh │ │ ├── spark-ec2 │ │ ├── spark_ec2.py │ │ └── third_party │ │ │ └── boto-2.4.1.zip │ │ ├── examples │ │ └── src │ │ │ └── main 
│ │ │ ├── java │ │ │ └── org │ │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── examples │ │ │ │ ├── JavaHdfsLR.java │ │ │ │ ├── JavaLogQuery.java │ │ │ │ ├── JavaPageRank.java │ │ │ │ ├── JavaSparkPi.java │ │ │ │ ├── JavaTC.java │ │ │ │ ├── JavaWordCount.java │ │ │ │ ├── mllib │ │ │ │ ├── JavaALS.java │ │ │ │ ├── JavaKMeans.java │ │ │ │ └── JavaLR.java │ │ │ │ ├── sql │ │ │ │ └── JavaSparkSQL.java │ │ │ │ └── streaming │ │ │ │ ├── JavaCustomReceiver.java │ │ │ │ ├── JavaFlumeEventCount.java │ │ │ │ ├── JavaKafkaWordCount.java │ │ │ │ ├── JavaNetworkWordCount.java │ │ │ │ └── JavaQueueStream.java │ │ │ ├── python │ │ │ ├── als.py │ │ │ ├── kmeans.py │ │ │ ├── logistic_regression.py │ │ │ ├── mllib │ │ │ │ ├── kmeans.py │ │ │ │ └── logistic_regression.py │ │ │ ├── pagerank.py │ │ │ ├── pi.py │ │ │ ├── sort.py │ │ │ ├── transitive_closure.py │ │ │ └── wordcount.py │ │ │ ├── resources │ │ │ ├── kv1.txt │ │ │ └── people.txt │ │ │ └── scala │ │ │ └── org │ │ │ └── apache │ │ │ └── spark │ │ │ └── examples │ │ │ ├── BroadcastTest.scala │ │ │ ├── CassandraCQLTest.scala │ │ │ ├── CassandraTest.scala │ │ │ ├── DriverSubmissionTest.scala │ │ │ ├── ExceptionHandlingTest.scala │ │ │ ├── GroupByTest.scala │ │ │ ├── HBaseTest.scala │ │ │ ├── HdfsTest.scala │ │ │ ├── LocalALS.scala │ │ │ ├── LocalFileLR.scala │ │ │ ├── LocalKMeans.scala │ │ │ ├── LocalLR.scala │ │ │ ├── LocalPi.scala │ │ │ ├── LogQuery.scala │ │ │ ├── MultiBroadcastTest.scala │ │ │ ├── SimpleSkewedGroupByTest.scala │ │ │ ├── SkewedGroupByTest.scala │ │ │ ├── SparkALS.scala │ │ │ ├── SparkHdfsLR.scala │ │ │ ├── SparkKMeans.scala │ │ │ ├── SparkLR.scala │ │ │ ├── SparkPageRank.scala │ │ │ ├── SparkPi.scala │ │ │ ├── SparkTC.scala │ │ │ ├── SparkTachyonHdfsLR.scala │ │ │ ├── SparkTachyonPi.scala │ │ │ ├── bagel │ │ │ ├── PageRankUtils.scala │ │ │ ├── WikipediaPageRank.scala │ │ │ └── WikipediaPageRankStandalone.scala │ │ │ ├── graphx │ │ │ └── LiveJournalPageRank.scala │ │ │ ├── mllib │ │ │ ├── BinaryClassification.scala │ │ │ ├── DecisionTreeRunner.scala │ │ │ ├── DenseKMeans.scala │ │ │ ├── LinearRegression.scala │ │ │ ├── MovieLensALS.scala │ │ │ ├── SparseNaiveBayes.scala │ │ │ ├── TallSkinnyPCA.scala │ │ │ └── TallSkinnySVD.scala │ │ │ ├── sql │ │ │ ├── RDDRelation.scala │ │ │ └── hive │ │ │ │ └── HiveFromSpark.scala │ │ │ └── streaming │ │ │ ├── ActorWordCount.scala │ │ │ ├── CustomReceiver.scala │ │ │ ├── FlumeEventCount.scala │ │ │ ├── HdfsWordCount.scala │ │ │ ├── KafkaWordCount.scala │ │ │ ├── MQTTWordCount.scala │ │ │ ├── NetworkWordCount.scala │ │ │ ├── QueueStream.scala │ │ │ ├── RawNetworkGrep.scala │ │ │ ├── RecoverableNetworkWordCount.scala │ │ │ ├── StatefulNetworkWordCount.scala │ │ │ ├── StreamingExamples.scala │ │ │ ├── TwitterAlgebirdCMS.scala │ │ │ ├── TwitterAlgebirdHLL.scala │ │ │ ├── TwitterPopularTags.scala │ │ │ ├── ZeroMQWordCount.scala │ │ │ └── clickstream │ │ │ ├── PageViewGenerator.scala │ │ │ └── PageViewStream.scala │ │ ├── lib │ │ ├── spark-assembly.1 │ │ └── spark-assembly.2 │ │ ├── python │ │ ├── .gitignore │ │ ├── epydoc.conf │ │ ├── lib │ │ │ ├── PY4J_LICENSE.txt │ │ │ └── py4j-0.8.1-src.zip │ │ ├── pyspark │ │ │ ├── __init__.py │ │ │ ├── accumulators.py │ │ │ ├── broadcast.py │ │ │ ├── cloudpickle.py │ │ │ ├── conf.py │ │ │ ├── context.py │ │ │ ├── daemon.py │ │ │ ├── files.py │ │ │ ├── java_gateway.py │ │ │ ├── join.py │ │ │ ├── mllib │ │ │ │ ├── __init__.py │ │ │ │ ├── _common.py │ │ │ │ ├── classification.py │ │ │ │ ├── clustering.py │ │ │ │ ├── linalg.py │ │ │ │ ├── recommendation.py │ │ │ │ ├── 
regression.py │ │ │ │ ├── tests.py │ │ │ │ └── util.py │ │ │ ├── rdd.py │ │ │ ├── rddsampler.py │ │ │ ├── resultiterable.py │ │ │ ├── serializers.py │ │ │ ├── shell.py │ │ │ ├── sql.py │ │ │ ├── statcounter.py │ │ │ ├── storagelevel.py │ │ │ ├── tests.py │ │ │ └── worker.py │ │ ├── run-tests │ │ └── test_support │ │ │ ├── hello.txt │ │ │ ├── userlib-0.1-py2.7.egg │ │ │ └── userlibrary.py │ │ └── sbin │ │ ├── slaves.sh │ │ ├── spark-config.sh │ │ ├── spark-daemon.sh │ │ ├── spark-daemons.sh │ │ ├── spark-executor │ │ ├── start-all.sh │ │ ├── start-history-server.sh │ │ ├── start-master.sh │ │ ├── start-slave.sh │ │ ├── start-slaves.sh │ │ ├── stop-all.sh │ │ ├── stop-history-server.sh │ │ ├── stop-master.sh │ │ └── stop-slaves.sh │ ├── manifests │ ├── defaults.pp │ ├── master.pp │ ├── spark.pp │ ├── user.pp │ └── worker.pp │ └── templates │ ├── metrics.properties.erb │ ├── spark-env.sh.erb │ ├── spark-master-runner.sh.erb │ ├── spark-master.conf.erb │ ├── spark-worker-runner.sh.erb │ └── spark-worker.conf.erb ├── requirements.txt └── sample-application ├── .gitignore ├── build.sbt ├── config.yaml.tmpl └── src └── main └── scala └── ExampleApp.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | config.yaml 3 | project 4 | target 5 | adobe_hadoop.pp 6 | spark-worker.pp 7 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Brandon Amos 2 | David Tompkins 3 | -------------------------------------------------------------------------------- /config.yaml.tmpl: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | ## 3 | ## Copyright (c) 2014 Adobe Systems Incorporated. All rights reserved. 4 | ## 5 | ## Licensed under the Apache License, Version 2.0 (the "License"); 6 | ## you may not use this file except in compliance with the License. 7 | ## You may obtain a copy of the License at 8 | ## 9 | ## http://www.apache.org/licenses/LICENSE-2.0 10 | ## 11 | ## Unless required by applicable law or agreed to in writing, software 12 | ## distributed under the License is distributed on an "AS IS" BASIS, 13 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ## See the License for the specific language governing permissions and 15 | ## limitations under the License. 16 | ## 17 | ########################################################################### 18 | 19 | master: 20 | - server0 21 | all: &all 22 | - server0 23 | - server1 24 | - server2 25 | - server3 26 | - server4 27 | - server5 28 | workers: 29 | *all 30 | -------------------------------------------------------------------------------- /env.sh: -------------------------------------------------------------------------------- 1 | # env.sh 2 | # Source this script for Spark standalone deployment shell functions. 3 | # 4 | ########################################################################### 5 | ## 6 | ## Copyright (c) 2014 Adobe Systems Incorporated. All rights reserved. 7 | ## 8 | ## Licensed under the Apache License, Version 2.0 (the "License"); 9 | ## you may not use this file except in compliance with the License. 
10 | ## You may obtain a copy of the License at 11 | ## 12 | ## http://www.apache.org/licenses/LICENSE-2.0 13 | ## 14 | ## Unless required by applicable law or agreed to in writing, software 15 | ## distributed under the License is distributed on an "AS IS" BASIS, 16 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | ## See the License for the specific language governing permissions and 18 | ## limitations under the License. 19 | ## 20 | ########################################################################### 21 | 22 | DEPLOY_DIR="$( cd "$( dirname "$0" )" && pwd )" 23 | INITIAL_DIR="$DEPLOY_DIR/initial-deployment" 24 | APPLICATION_DIR="$DEPLOY_DIR/application-deployment" 25 | 26 | # Initial deployment shell aliases/functions. 27 | function spark-init() { 28 | fab -f $DEPLOY_DIR/initial-deployment-fabfile.py $* 29 | } 30 | 31 | alias si='spark-init' 32 | alias si-list='spark-init -list' 33 | alias si-start-hm='spark-init startHdfsMaster' 34 | alias si-start-hw='spark-init startHdfsWorkers' 35 | alias si-start-sm='spark-init startSparkMaster' 36 | alias si-start-sw='spark-init startSparkWorkers' 37 | alias si-stop-hm='spark-init stopHdfsMaster' 38 | alias si-stop-hw='spark-init stopHdfsWorkers' 39 | alias si-stop-sm='spark-init stopSparkMaster' 40 | alias si-stop-sw='spark-init stopSparkWorkers' 41 | 42 | # Application deployment shell aliases/functions. 43 | function spark-submit() { 44 | fab -f $DEPLOY_DIR/application-deployment-fabfile.py $* 45 | } 46 | 47 | alias ss='spark-submit' 48 | alias ss-list='spark-submit -list' 49 | alias ss-sy='spark-submit sync' 50 | alias ss-st='spark-submit start' 51 | alias ss-a='spark-submit assembly' 52 | alias ss-ss='spark-submit sync start' 53 | alias ss-o='spark-submit getOutput' 54 | alias ss-k='spark-submit kill' 55 | -------------------------------------------------------------------------------- /images/application-deployment-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/images/application-deployment-1.png -------------------------------------------------------------------------------- /images/initial-deployment-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/images/initial-deployment-1.png -------------------------------------------------------------------------------- /images/initial-deployment-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/images/initial-deployment-2.png -------------------------------------------------------------------------------- /images/initial-deployment-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/images/initial-deployment-3.png -------------------------------------------------------------------------------- /initial-deployment-puppet/manifests/adobe_hadoop.pp.tmpl: -------------------------------------------------------------------------------- 1 | class adobe::hadoop_base { 2 | class { 'cdh4::hadoop': 3 | namenode_hosts => ['namenode_server'], 4 | datanode_mounts => [
5 | '/raid/hadoop/data' 6 | ], 7 | dfs_name_dir => '/raid/hadoop/name' 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /initial-deployment-puppet/manifests/hdfs-master.pp: -------------------------------------------------------------------------------- 1 | import "adobe_hadoop" 2 | 3 | node default { 4 | include adobe::hadoop_base 5 | include cdh4::hadoop::master 6 | } 7 | -------------------------------------------------------------------------------- /initial-deployment-puppet/manifests/hdfs-worker.pp: -------------------------------------------------------------------------------- 1 | import "adobe_hadoop" 2 | 3 | node default { 4 | include adobe::hadoop_base 5 | include cdh4::hadoop::worker 6 | } 7 | -------------------------------------------------------------------------------- /initial-deployment-puppet/manifests/spark-master.pp: -------------------------------------------------------------------------------- 1 | node default { 2 | class { 'spark::master': 3 | worker_mem => '22g' 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /initial-deployment-puppet/manifests/spark-worker.pp.tmpl: -------------------------------------------------------------------------------- 1 | node default { 2 | class { 'spark::worker': 3 | master => 'namenode_server', 4 | memory => '22g', 5 | scratch_dir => "/raid/spark-work" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/.gitreview: -------------------------------------------------------------------------------- 1 | [gerrit] 2 | host=gerrit.wikimedia.org 3 | port=29418 4 | project=operations/puppet/cdh4.git 5 | defaultbranch=master 6 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | =============== 3 | 4 | Copyright (c) 2013 Andrew Otto , the Wikimedia Foundation. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/TODO.md: -------------------------------------------------------------------------------- 1 | **Table of Contents** *generated with [DocToc](http://doctoc.herokuapp.com/)* 2 | 3 | - [TODO:](#todo) 4 | - [Hadoop](#hadoop) 5 | - [HBase](#hbase) 6 | - [Zookeeper](#zookeeper) 7 | 8 | # TODO: 9 | 10 | ## Hadoop 11 | 12 | - Add hosts.exclude support for decommissioning nodes. 13 | - Change cluster (conf) name? (use update-alternatives?) 14 | - Set default # map/reduce tasks automatically based on facter node stats. 15 | - Handle ensure => absent, especially for MRv1 vs YARN packages and services. 16 | - Implement standalone yarn proxyserver support. 17 | - Make log4j.properties more configurable. 18 | - Support Secondary NameNode. 19 | - Make JMX ports configurable. 20 | - Make hadoop-metrics2.properties more configurable. 21 | - Support HA automatic failover. 22 | - HA NameNode Fencing support. 23 | - Rename 'use_yarn' parameter to 'yarn_enabled' for consistency. 24 | 25 | ## HBase 26 | - Implement. 27 | 28 | ## Zookeeper 29 | 30 | Won't implement. A Zookeeper package is available upstream in Debian/Ubuntu. 31 | Puppetization for this package can be found at 32 | https://github.com/wikimedia/operations-puppet-zookeeper 33 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/datanode.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::datanode 2 | # Installs and starts up a Hadoop DataNode. 3 | # 4 | class cdh4::hadoop::datanode { 5 | Class['cdh4::hadoop'] -> Class['cdh4::hadoop::datanode'] 6 | 7 | # install datanode daemon package 8 | package { 'hadoop-hdfs-datanode': 9 | ensure => 'installed' 10 | } 11 | 12 | # start the datanode daemon service 13 | service { 'hadoop-hdfs-datanode': 14 | ensure => 'running', 15 | enable => true, 16 | hasstatus => true, 17 | hasrestart => true, 18 | alias => 'datanode', 19 | require => Package['hadoop-hdfs-datanode'], 20 | } 21 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/defaults.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::defaults 2 | # Default parameters for cdh4::hadoop configuration.
3 | # 4 | class cdh4::hadoop::defaults { 5 | $config_directory = '/etc/hadoop/conf' 6 | 7 | $nameservice_id = undef 8 | $journalnode_hosts = undef 9 | $dfs_journalnode_edits_dir = undef 10 | 11 | $datanode_mounts = undef 12 | $dfs_data_path = 'hdfs/dn' 13 | $yarn_local_path = 'yarn/local' 14 | $yarn_logs_path = 'yarn/logs' 15 | $dfs_block_size = 67108864 # 64MB default 16 | $enable_jmxremote = true 17 | $enable_webhdfs = true 18 | $mapreduce_system_dir = undef 19 | $io_file_buffer_size = undef 20 | $mapreduce_map_tasks_maximum = undef 21 | $mapreduce_reduce_tasks_maximum = undef 22 | $mapreduce_job_reuse_jvm_num_tasks = undef 23 | $mapreduce_reduce_shuffle_parallelcopies = undef 24 | $mapreduce_map_memory_mb = undef 25 | $mapreduce_reduce_memory_mb = undef 26 | $mapreduce_task_io_sort_mb = undef 27 | $mapreduce_task_io_sort_factor = undef 28 | $mapreduce_map_java_opts = undef 29 | $mapreduce_reduce_java_opts = undef 30 | $mapreduce_shuffle_port = undef 31 | $mapreduce_intermediate_compression = false 32 | $mapreduce_intermediate_compression_codec = 'org.apache.hadoop.io.compress.DefaultCodec' 33 | $mapreduce_output_compression = false 34 | $mapreduce_output_compression_codec = 'org.apache.hadoop.io.compress.DefaultCodec' 35 | $mapreduce_output_compression_type = 'RECORD' 36 | $yarn_nodemanager_resource_memory_mb = undef 37 | $yarn_resourcemanager_scheduler_class = undef 38 | $use_yarn = true 39 | $ganglia_hosts = undef 40 | $net_topology_script_template = undef 41 | 42 | 43 | # JMX Ports (These are not currently configurable) 44 | $namenode_jmxremote_port = 9980 45 | $datanode_jmxremote_port = 9981 46 | $resourcemanager_jmxremote_port = 9983 47 | $nodemanager_jmxremote_port = 9984 48 | $proxyserver_jmxremote_port = 9985 49 | } 50 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/directory.pp: -------------------------------------------------------------------------------- 1 | # == Define cdh4::hadoop::directory 2 | # 3 | # Creates or removes a directory in HDFS. 4 | # 5 | # == Notes: 6 | # This will not check ownership and permissions 7 | # of a directory. It will only check for the directory's 8 | # existence. If it does not exist, the directory will be 9 | # created and given specified ownership and permissions. 10 | # This will not attempt to set ownership and permissions 11 | # if the directory already exists. 12 | # 13 | # This define does not support managing files in HDFS, 14 | # only directories. 15 | # 16 | # Ideally this define would be ported into a Puppet File Provider. 17 | # I once spent some time trying to make that work, but it was more 18 | # difficult than it sounds. For example, you'd need to handle conversion 19 | # between symbolic and numeric modes, as I could not find a way to 20 | # get hadoop fs to list numeric modes for comparison. Perhaps 21 | # there's a way to use HttpFS to do this instead? 22 | # 23 | # == Parameters: 24 | # $path - HDFS directory path. Default: $title 25 | # $ensure - present|absent. Default: present 26 | # $owner - HDFS directory owner. Default: hdfs 27 | # $group - HDFS directory group owner. Default: hdfs 28 | # $mode - HDFS directory mode.
Default 0755 29 | # 30 | define cdh4::hadoop::directory ( 31 | $path = $title, 32 | $ensure = 'present', 33 | $owner = 'hdfs', 34 | $group = 'hdfs', 35 | $mode = '0755') 36 | { 37 | Class['cdh4::hadoop'] -> Cdh4::Hadoop::Directory[$title] 38 | 39 | if $ensure == 'present' { 40 | exec { "cdh4::hadoop::directory ${title}": 41 | command => "/usr/bin/hadoop fs -mkdir ${path} && /usr/bin/hadoop fs -chmod ${mode} ${path} && /usr/bin/hadoop fs -chown ${owner}:${group} ${path}", 42 | unless => "/usr/bin/hadoop fs -test -e ${path}", 43 | user => 'hdfs', 44 | } 45 | } 46 | else { 47 | exec { "cdh4::hadoop::directory ${title}": 48 | command => "/usr/bin/hadoop fs -rm -R ${path}", 49 | onlyif => "/usr/bin/hadoop fs -test -e ${path}", 50 | user => 'hdfs', 51 | require => Service['hadoop-hdfs-namenode'], 52 | } 53 | } 54 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/historyserver.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::historyserver 2 | # Installs and starts up a Hadoop YARN HistoryServer. 3 | # This will ensure that the HDFS /user/history exists. 4 | # This class may only be included on the NameNode Master 5 | # Hadoop node. 6 | # 7 | class cdh4::hadoop::historyserver { 8 | Class['cdh4::hadoop::namenode'] -> Class['cdh4::hadoop::historyserver'] 9 | 10 | if !$::cdh4::hadoop::use_yarn { 11 | fail('Cannot use Hadoop YARN HistoryServer if cdh4::hadoop::use_yarn is false.') 12 | } 13 | 14 | # Create HistoryServer HDFS directories. 15 | # See: http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/4.2.0/CDH4-Installation-Guide/cdh4ig_topic_11_4.html 16 | cdh4::hadoop::directory { '/user/history': 17 | # sudo -u hdfs hadoop fs -mkdir /user/history 18 | # sudo -u hdfs hadoop fs -chmod -R 1777 /user/history 19 | # sudo -u hdfs hadoop fs -chown yarn /user/history 20 | owner => 'yarn', 21 | group => 'hdfs', 22 | mode => '1777', 23 | # Make sure HDFS directories are created before 24 | # historyserver is installed and started, but after 25 | # the namenode. 26 | require => [Service['hadoop-hdfs-namenode'], Cdh4::Hadoop::Directory['/user']], 27 | } 28 | 29 | package { 'hadoop-mapreduce-historyserver': 30 | ensure => 'installed', 31 | require => Cdh4::Hadoop::Directory['/user/history'], 32 | } 33 | 34 | service { 'hadoop-mapreduce-historyserver': 35 | ensure => 'running', 36 | enable => true, 37 | hasstatus => true, 38 | hasrestart => true, 39 | alias => 'historyserver', 40 | require => Package['hadoop-mapreduce-historyserver'], 41 | } 42 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/jmxtrans/README.md: -------------------------------------------------------------------------------- 1 | Hadoop very conveniently ships with built-in Ganglia metrics reporter support. 2 | However, the GangliaContext class uses DatagramSocket instead of MulticastSocket. 3 | This will only work in Ganglia multicast setups where there is no more than 4 | 1 network hop needed to get to the Ganglia aggregator(s) for your multicast group. 5 | See https://issues.apache.org/jira/browse/HADOOP-10181 for more details. 6 | 7 | Wikimedia uses a multi-row VLAN setup for its Hadoop nodes, and needs a way 8 | to send Hadoop metrics to Ganglia in a multicast setup. Jmxtrans supports 9 | this.
These jmxtrans classes can be included to send a particular Hadoop 10 | service's metrics to Ganglia. 11 | 12 | # Usage 13 | 14 | On your Hadoop master node: 15 | 16 | ```puppet 17 | class { 'cdh4::hadoop::jmxtrans::master': 18 | ganglia => 'ganglia.example.com', 19 | } 20 | ``` 21 | 22 | On your Hadoop worker nodes: 23 | ```puppet 24 | class { 'cdh4::hadoop::jmxtrans::worker': 25 | ganglia => 'ganglia.example.com', 26 | } 27 | ``` 28 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/jmxtrans/master.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::jmxtrans::master 2 | # Convenience class to include jmxtrans classes for NameNode and ResourceManager 3 | class cdh4::hadoop::jmxtrans::master( 4 | $ganglia = undef, 5 | $graphite = undef, 6 | $outfile = undef, 7 | ) 8 | { 9 | class { ['cdh4::hadoop::jmxtrans::namenode', 'cdh4::hadoop::jmxtrans::resourcemanager']: 10 | ganglia => $ganglia, 11 | graphite => $graphite, 12 | outfile => $outfile, 13 | } 14 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/jmxtrans/worker.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::jmxtrans::worker 2 | # Convenience class to include jmxtrans classes for DataNode and NodeManager 3 | class cdh4::hadoop::jmxtrans::worker( 4 | $ganglia = undef, 5 | $graphite = undef, 6 | $outfile = undef, 7 | ) 8 | { 9 | class { ['cdh4::hadoop::jmxtrans::datanode', 'cdh4::hadoop::jmxtrans::nodemanager']: 10 | ganglia => $ganglia, 11 | graphite => $graphite, 12 | outfile => $outfile, 13 | } 14 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/journalnode.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::journalnode 2 | # 3 | class cdh4::hadoop::journalnode { 4 | Class['cdh4::hadoop'] -> Class['cdh4::hadoop::journalnode'] 5 | 6 | # install journalnode daemon package 7 | package { 'hadoop-hdfs-journalnode': 8 | ensure => 'installed' 9 | } 10 | 11 | # Ensure that the journalnode edits directory has the correct permissions. 12 | file { $::cdh4::hadoop::dfs_journalnode_edits_dir: 13 | ensure => 'directory', 14 | owner => 'hdfs', 15 | group => 'hdfs', 16 | mode => '0755', 17 | require => Package['hadoop-hdfs-journalnode'], 18 | } 19 | 20 | # start the journalnode daemon service 21 | service { 'hadoop-hdfs-journalnode': 22 | ensure => 'running', 23 | enable => true, 24 | hasstatus => true, 25 | hasrestart => true, 26 | alias => 'journalnode', 27 | require => File[$::cdh4::hadoop::dfs_journalnode_edits_dir], 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/master.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::master 2 | # Wrapper class for Hadoop master node services: 3 | # - NameNode 4 | # - ResourceManager and HistoryServer (YARN) 5 | # OR 6 | # - JobTracker (MRv1). 7 | # 8 | class cdh4::hadoop::master { 9 | Class['cdh4::hadoop'] -> Class['cdh4::hadoop::master'] 10 | 11 | include cdh4::hadoop::namenode::primary 12 | 13 | # YARN uses ResourceManager and HistoryServer, 14 | # NOT JobTracker.
15 | if $::cdh4::hadoop::use_yarn { 16 | include cdh4::hadoop::resourcemanager 17 | include cdh4::hadoop::historyserver 18 | } 19 | # MRv1 just uses JobTracker 20 | else { 21 | include cdh4::hadoop::jobtracker 22 | } 23 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/namenode.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::namenode 2 | # Installs and configures a Hadoop NameNode. 3 | # This will format the NameNode if it is not 4 | # already formatted. It will also create 5 | # a common HDFS directory hierarchy. 6 | # 7 | # Note: If you are using HA NameNode (indicated by setting 8 | # cdh4::hadoop::nameservice_id), your JournalNodes should be running before 9 | # this class is applied. 10 | # 11 | class cdh4::hadoop::namenode { 12 | Class['cdh4::hadoop'] -> Class['cdh4::hadoop::namenode'] 13 | 14 | # install namenode daemon package 15 | package { 'hadoop-hdfs-namenode': 16 | ensure => installed 17 | } 18 | 19 | file { "${::cdh4::hadoop::config_directory}/hosts.exclude": 20 | ensure => 'present', 21 | require => Package['hadoop-hdfs-namenode'], 22 | } 23 | 24 | # Ensure that the namenode directory has the correct permissions. 25 | file { $::cdh4::hadoop::dfs_name_dir: 26 | ensure => 'directory', 27 | owner => 'hdfs', 28 | group => 'hdfs', 29 | mode => '0700', 30 | require => Package['hadoop-hdfs-namenode'], 31 | } 32 | 33 | # If $dfs_name_dir/current/VERSION doesn't exist, assume 34 | # NameNode has not been formatted. Format it before 35 | # the namenode service is started. 36 | exec { 'hadoop-namenode-format': 37 | command => '/usr/bin/hdfs namenode -format', 38 | creates => "${::cdh4::hadoop::dfs_name_dir_main}/current/VERSION", 39 | user => 'hdfs', 40 | require => File[$::cdh4::hadoop::dfs_name_dir], 41 | } 42 | 43 | service { 'hadoop-hdfs-namenode': 44 | ensure => 'running', 45 | enable => true, 46 | hasstatus => true, 47 | hasrestart => true, 48 | alias => 'namenode', 49 | require => [File["${::cdh4::hadoop::config_directory}/hosts.exclude"], Exec['hadoop-namenode-format']], 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/namenode/standby.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::namenode::standby 2 | # Hadoop Standby NameNode. Include this class instead of 3 | # cdh4::hadoop::master on your HA standby NameNode(s). This 4 | # will bootstrap the standby dfs.name.dir with the contents 5 | # from your primary active NameNode. 6 | # 7 | # See README.md for more documentation. 8 | # 9 | # NOTE: Your JournalNodes should be running before this class is applied. 10 | # 11 | class cdh4::hadoop::namenode::standby inherits cdh4::hadoop::namenode { 12 | # Fail if nameservice_id isn't set. 13 | if (!$::cdh4::hadoop::ha_enabled) { 14 | fail('Cannot use Standby NameNode in a non HA setup. Set $nameservice_id on the cdh4::hadoop class to enable HA.') 15 | } 16 | 17 | # Override the namenode -format command to bootstrap this 18 | # standby NameNode's dfs.name.dir with the data from the 19 | # active NameNode.
20 | Exec['hadoop-namenode-format'] { 21 | command => '/usr/bin/hdfs namenode -bootstrapStandby', 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/nodemanager.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::nodemanager 2 | # Installs and configures a Hadoop NodeManager worker node. 3 | # 4 | class cdh4::hadoop::nodemanager { 5 | Class['cdh4::hadoop'] -> Class['cdh4::hadoop::nodemanager'] 6 | 7 | if !$::cdh4::hadoop::use_yarn { 8 | fail('Cannot use Hadoop YARN NodeManager if cdh4::hadoop::use_yarn is false.') 9 | } 10 | 11 | package { ['hadoop-yarn-nodemanager', 'hadoop-mapreduce']: 12 | ensure => 'installed', 13 | } 14 | 15 | # NodeManager (YARN TaskTracker) 16 | service { 'hadoop-yarn-nodemanager': 17 | ensure => 'running', 18 | enable => true, 19 | hasstatus => true, 20 | hasrestart => true, 21 | alias => 'nodemanager', 22 | require => [Package['hadoop-yarn-nodemanager', 'hadoop-mapreduce']], 23 | } 24 | } 25 | 26 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/resourcemanager.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::resourcemanager 2 | # Installs and configures Hadoop YARN ResourceManager. 3 | # This will create YARN HDFS directories. 4 | # 5 | class cdh4::hadoop::resourcemanager { 6 | Class['cdh4::hadoop::namenode'] -> Class['cdh4::hadoop::resourcemanager'] 7 | 8 | if !$::cdh4::hadoop::use_yarn { 9 | fail('Cannot use Hadoop YARN ResourceManager if cdh4::hadoop::use_yarn is false.') 10 | } 11 | 12 | # Create YARN HDFS directories. 13 | # See: http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/4.2.0/CDH4-Installation-Guide/cdh4ig_topic_11_4.html 14 | cdh4::hadoop::directory { '/var/log/hadoop-yarn': 15 | # sudo -u hdfs hadoop fs -mkdir /var/log/hadoop-yarn 16 | # sudo -u hdfs hadoop fs -chown yarn:mapred /var/log/hadoop-yarn 17 | owner => 'yarn', 18 | group => 'mapred', 19 | mode => '0755', 20 | # Make sure HDFS directories are created before 21 | # resourcemanager is installed and started, but after 22 | # the namenode. 23 | require => [Service['hadoop-hdfs-namenode'], Cdh4::Hadoop::Directory['/var/log']], 24 | } 25 | 26 | package { 'hadoop-yarn-resourcemanager': 27 | ensure => 'installed', 28 | require => Cdh4::Hadoop::Directory['/var/log/hadoop-yarn'], 29 | } 30 | 31 | service { 'hadoop-yarn-resourcemanager': 32 | ensure => 'running', 33 | enable => true, 34 | hasstatus => true, 35 | hasrestart => true, 36 | alias => 'resourcemanager', 37 | require => Package['hadoop-yarn-resourcemanager'], 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/tasktracker.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::tasktracker 2 | # Installs and configures Hadoop MRv1 TaskTracker. 
3 | class cdh4::hadoop::tasktracker { 4 | Class['cdh4::hadoop'] -> Class['cdh4::hadoop::tasktracker'] 5 | 6 | if $::cdh4::hadoop::use_yarn { 7 | fail('Cannot use Hadoop MRv1 TaskTracker if cdh4::hadoop::use_yarn is true.') 8 | } 9 | 10 | # install tasktracker daemon package 11 | package { 'hadoop-0.20-mapreduce-tasktracker': 12 | ensure => 'installed' 13 | } 14 | 15 | service { 'hadoop-0.20-mapreduce-tasktracker': 16 | ensure => 'running', 17 | enable => true, 18 | hasstatus => true, 19 | hasrestart => true, 20 | alias => 'tasktracker', 21 | require => Package['hadoop-0.20-mapreduce-tasktracker'], 22 | } 23 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hadoop/worker.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hadoop::worker 2 | # Wrapper class for Hadoop Worker node services: 3 | # - DataNode 4 | # - NodeManager (YARN) 5 | # OR 6 | # - TaskTracker (MRv1) 7 | # 8 | # This class will attempt to create and manage the required 9 | # local worker directories defined in the $datanode_mounts array. 10 | # You must make sure that the paths defined in $datanode_mounts are 11 | # formatted and mounted properly yourself; the CDH4 module does not 12 | # manage them. 13 | # 14 | class cdh4::hadoop::worker { 15 | Class['cdh4::hadoop'] -> Class['cdh4::hadoop::worker'] 16 | 17 | cdh4::hadoop::worker::paths { $::cdh4::hadoop::datanode_mounts: } 18 | 19 | class { 'cdh4::hadoop::datanode': 20 | require => Cdh4::Hadoop::Worker::Paths[$::cdh4::hadoop::datanode_mounts], 21 | } 22 | 23 | # YARN uses NodeManager. 24 | if $::cdh4::hadoop::use_yarn { 25 | class { 'cdh4::hadoop::nodemanager': 26 | require => Cdh4::Hadoop::Worker::Paths[$::cdh4::hadoop::datanode_mounts], 27 | } 28 | } 29 | # MRv1 uses TaskTracker. 30 | else { 31 | class { 'cdh4::hadoop::tasktracker': 32 | require => Cdh4::Hadoop::Worker::Paths[$::cdh4::hadoop::datanode_mounts], 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hcatalog.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hcatalog 2 | # This class doesn't yet do anything other than 3 | # install the hcatalog package.
This will be expanded 4 | # If/when we need more functionality (hcatalog-server, etc.), 5 | # 6 | class cdh4::hcatalog { 7 | package { 'hcatalog': 8 | ensure => 'installed', 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hive/defaults.pp: -------------------------------------------------------------------------------- 1 | # == Class hive::defaults 2 | # Default Hive configs 3 | # 4 | class cdh4::hive::defaults { 5 | $zookeeper_hosts = undef 6 | 7 | $jdbc_driver = 'com.mysql.jdbc.Driver' 8 | $jdbc_protocol = 'mysql' 9 | $jdbc_database = 'hive_metastore' 10 | $jdbc_host = 'localhost' 11 | $jdbc_port = 3306 12 | $jdbc_username = 'hive' 13 | $jdbc_password = 'hive' 14 | 15 | $db_root_username = undef 16 | $db_root_password = undef 17 | 18 | $exec_parallel_thread_number = 8 # set this to 0 to disable hive.exec.parallel 19 | $optimize_skewjoin = false 20 | $skewjoin_key = 10000 21 | $skewjoin_mapjoin_map_tasks = 10000 22 | $skewjoin_mapjoin_min_split = 33554432 23 | 24 | $stats_enabled = false 25 | $stats_dbclass = 'jdbc:derby' 26 | $stats_jdbcdriver = 'org.apache.derby.jdbc.EmbeddedDriver' 27 | $stats_dbconnectionstring = 'jdbc:derby:;databaseName=TempStatsStore;create=true' 28 | 29 | # Default puppet paths to template config files. 30 | # This allows us to use custom template config files 31 | # if we want to override more settings than this 32 | # module yet supports. 33 | $hive_site_template = 'cdh4/hive/hive-site.xml.erb' 34 | $hive_exec_log4j_template = 'cdh4/hive/hive-exec-log4j.properties.erb' 35 | 36 | # Further path/jar to add to hive's classpath 37 | # (Until Hive 0.12.0 this can only be a single path (see HIVE-2269 ) 38 | $auxpath = undef 39 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hive/master.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hive::master 2 | # Wrapper class for hive::server, hive::metastore, and hive::metastore::* databases. 3 | # 4 | # Include this class on your Hive master node with $metastore_database 5 | # set to one of the available metastore backend classes in the hive/metastore/ 6 | # directory. If you want to set up a hive metastore database backend that 7 | # is not supported here, you may set $metastore_databse to undef. 8 | # 9 | # You must separately ensure that your $metastore_database (e.g. mysql) package 10 | # is installed. 11 | # 12 | # == Parameters 13 | # $metastore_database - Name of metastore database to use. This should be 14 | # the name of a cdh4::hive::metastore::* class in 15 | # hive/metastore/*.pp. 16 | # 17 | class cdh4::hive::master($metastore_database = 'mysql') { 18 | class { 'cdh4::hive::server': } 19 | class { 'cdh4::hive::metastore': } 20 | 21 | # Set up the metastore database by including 22 | # the $metastore_database_class. 
23 | $metastore_database_class = "cdh4::hive::metastore::${metastore_database}" 24 | if ($metastore_database) { 25 | class { $metastore_database_class: } 26 | } 27 | 28 | # Make sure the $metastore_database_class is included and set up 29 | # before we start the hive-metastore service 30 | Class[$metastore_database_class] -> Class['cdh4::hive::metastore'] 31 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hive/metastore.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hive::metastore 2 | # 3 | class cdh4::hive::metastore 4 | { 5 | Class['cdh4::hive'] -> Class['cdh4::hive::metastore'] 6 | 7 | package { 'hive-metastore': 8 | ensure => 'installed', 9 | } 10 | 11 | service { 'hive-metastore': 12 | ensure => 'running', 13 | require => Package['hive-metastore'], 14 | hasrestart => true, 15 | hasstatus => true, 16 | } 17 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hive/server.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hive::server 2 | # Configures hive-server2. Requires that cdh4::hadoop is included so that 3 | # hadoop-client is available to create hive HDFS directories. 4 | # 5 | # See: http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/4.2.0/CDH4-Installation-Guide/cdh4ig_topic_18_5.html 6 | # 7 | class cdh4::hive::server 8 | { 9 | # cdh4::hive::server requires hadoop client and configs are installed. 10 | Class['cdh4::hadoop'] -> Class['cdh4::hive::server'] 11 | Class['cdh4::hive'] -> Class['cdh4::hive::server'] 12 | 13 | package { 'hive-server2': 14 | ensure => 'installed', 15 | alias => 'hive-server', 16 | } 17 | 18 | # sudo -u hdfs hadoop fs -mkdir /user/hive 19 | # sudo -u hdfs hadoop fs -chmod 0775 /user/hive 20 | # sudo -u hdfs hadoop fs -chown hive:hadoop /user/hive 21 | cdh4::hadoop::directory { '/user/hive': 22 | owner => 'hive', 23 | group => 'hadoop', 24 | mode => '0775', 25 | require => Package['hive'], 26 | } 27 | # sudo -u hdfs hadoop fs -mkdir /user/hive/warehouse 28 | # sudo -u hdfs hadoop fs -chmod 1777 /user/hive/warehouse 29 | # sudo -u hdfs hadoop fs -chown hive:hadoop /user/hive/warehouse 30 | cdh4::hadoop::directory { '/user/hive/warehouse': 31 | owner => 'hive', 32 | group => 'hadoop', 33 | mode => '1777', 34 | require => Cdh4::Hadoop::Directory['/user/hive'], 35 | } 36 | 37 | service { 'hive-server2': 38 | ensure => 'running', 39 | require => Package['hive-server2'], 40 | hasrestart => true, 41 | hasstatus => true, 42 | } 43 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/hue/defaults.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::hue::defaults 2 | # 3 | class cdh4::hue::defaults { 4 | $http_host = '0.0.0.0' 5 | $http_port = 8888 6 | $secret_key = undef 7 | 8 | # Set Hue Oozie defaults to those already 9 | # set in the cdh4::oozie class. 10 | if (defined(Class['cdh4::oozie'])) { 11 | $oozie_url = $cdh4::oozie::url 12 | # Is this the proper default values? I'm not sure. 13 | $oozie_security_enabled = $cdh4::hue::defaults::oozie_security_enabled 14 | } 15 | # Otherwise disable Oozie interface for Hue. 
16 | else { 17 | $oozie_url = undef 18 | $oozie_security_enabled = undef 19 | } 20 | 21 | $smtp_host = 'localhost' 22 | $smtp_port = 25 23 | $smtp_user = undef 24 | $smtp_password = undef 25 | $smtp_from_email = undef 26 | 27 | $ssl_private_key = '/etc/ssl/private/hue.key' 28 | $ssl_certificate = '/etc/ssl/certs/hue.cert' 29 | 30 | # if httpfs is enabled, the default httpfs port 31 | # will be used, instead of the webhdfs port. 32 | $httpfs_enabled = false 33 | 34 | $ldap_url = undef 35 | $ldap_cert = undef 36 | $ldap_nt_domain = undef 37 | $ldap_bind_dn = undef 38 | $ldap_base_dn = undef 39 | $ldap_bind_password = undef 40 | $ldap_username_pattern = undef 41 | $ldap_user_filter = undef 42 | $ldap_user_name_attr = undef 43 | $ldap_group_filter = undef 44 | $ldap_group_name_attr = undef 45 | $ldap_group_member_attr = undef 46 | 47 | $hue_ini_template = 'cdh4/hue/hue.ini.erb' 48 | 49 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/oozie.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::oozie 2 | # Installs the oozie-client package 3 | # And sets OOZIE_URL in /etc/profile.d/oozie.sh. 4 | # 5 | class cdh4::oozie( 6 | $oozie_host = 'localhost' 7 | ) 8 | { 9 | # oozie server url 10 | $url = "http://$oozie_host:11000/oozie" 11 | 12 | package { 'oozie-client': 13 | ensure => 'installed', 14 | } 15 | 16 | # create a file in /etc/profile.d to export OOZIE_URL. 17 | file { '/etc/profile.d/oozie.sh': 18 | content => "# NOTE: This file is managed by Puppet. 19 | 20 | export OOZIE_URL='${url}' 21 | ", 22 | mode => '0444', 23 | } 24 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/oozie/database/mysql.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::oozie::database::mysql 2 | # Configures and sets up a MySQL database for Oozie. 3 | # 4 | # Note that this class does not support running 5 | # the Oozie database on a different host than where your 6 | # oozie server will run. Permissions will only be granted 7 | # for localhost MySQL users, so oozie server must run on this node. 8 | # 9 | # Also, root must be able to run /usr/bin/mysql with no password and have permissions 10 | # to create databases and users and grant permissions. 11 | # 12 | # You probably shouldn't be including this class directly. Instead, include 13 | # cdh4::oozie::server with database => 'mysql'. 14 | # 15 | # See: http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/4.2.1/CDH4-Installation-Guide/cdh4ig_topic_17_6.html 16 | # 17 | class cdh4::oozie::database::mysql { 18 | if (!defined(Package['libmysql-java'])) { 19 | package { 'libmysql-java': 20 | ensure => 'installed', 21 | } 22 | } 23 | 24 | # symlink mysql.jar into /var/lib/oozie 25 | file { '/var/lib/oozie/mysql.jar': 26 | ensure => 'link', 27 | target => '/usr/share/java/mysql.jar', 28 | require => Package['libmysql-java'], 29 | } 30 | 31 | $db_name = $cdh4::oozie::server::jdbc_database 32 | $db_user = $cdh4::oozie::server::jdbc_username 33 | $db_pass = $cdh4::oozie::server::jdbc_password 34 | 35 | # oozie is going to need an oozie database and user. 
36 | exec { 'oozie_mysql_create_database': 37 | command => "/usr/bin/mysql -e \" 38 | CREATE DATABASE ${db_name}; 39 | GRANT ALL PRIVILEGES ON ${db_name}.* TO '${db_user}'@'localhost' IDENTIFIED BY '${db_pass}'; 40 | GRANT ALL PRIVILEGES ON ${db_name}.* TO '${db_user}'@'127.0.0.1' IDENTIFIED BY '${db_pass}';\"", 41 | unless => "/usr/bin/mysql -BNe 'SHOW DATABASES' | /bin/grep -q ${db_name}", 42 | user => 'root', 43 | } 44 | 45 | # run ooziedb.sh to create the oozie database schema 46 | exec { 'oozie_mysql_create_schema': 47 | command => '/usr/lib/oozie/bin/ooziedb.sh create -run', 48 | require => [Exec['oozie_mysql_create_database'], File['/var/lib/oozie/mysql.jar']], 49 | unless => "/usr/bin/mysql -u${db_user} -p'${db_pass}' ${db_name} -BNe 'SHOW TABLES;' | /bin/grep -q OOZIE_SYS", 50 | user => 'oozie', 51 | } 52 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/oozie/defaults.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::oozie::defaults 2 | # 3 | class cdh4::oozie::defaults { 4 | $database = 'mysql' 5 | 6 | $jdbc_driver = 'com.mysql.jdbc.Driver' 7 | $jdbc_protocol = 'mysql' 8 | $jdbc_database = 'oozie' 9 | $jdbc_host = 'localhost' 10 | $jdbc_port = 3306 11 | $jdbc_username = 'oozie' 12 | $jdbc_password = 'oozie' 13 | 14 | $smtp_host = undef 15 | $smtp_port = 25 16 | $smtp_from_email = undef 17 | $smtp_username = undef 18 | $smtp_password = undef 19 | 20 | $authorization_service_security_enabled = true 21 | 22 | # Default puppet paths to template config files. 23 | # This allows us to use custom template config files 24 | # if we want to override more settings than this 25 | # module yet supports. 26 | $oozie_site_template = 'cdh4/oozie/oozie-site.xml.erb' 27 | $oozie_env_template = 'cdh4/oozie/oozie-env.sh.erb' 28 | } 29 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/pig.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::pig 2 | # 3 | # Installs and configures Apache Pig. 4 | # 5 | class cdh4::pig { 6 | package { 'pig': 7 | ensure => 'installed', 8 | } 9 | 10 | file { '/etc/pig/conf/pig.properties': 11 | content => template('cdh4/pig/pig.properties.erb'), 12 | require => Package['pig'], 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/manifests/sqoop.pp: -------------------------------------------------------------------------------- 1 | # == Class cdh4::sqoop 2 | # Installs Sqoop 3 | class cdh4::sqoop { 4 | package { 'sqoop': 5 | ensure => 'installed', 6 | } 7 | 8 | if (!defined(Package['libmysql-java'])) { 9 | package { 'libmysql-java': 10 | ensure => 'installed', 11 | } 12 | } 13 | # symlink the mysql-connector-java.jar that is installed by 14 | # libmysql-java into /usr/lib/sqoop/lib 15 | # TODO: Can I create this symlink as mysql.jar? 
16 | file { '/usr/lib/sqoop/lib/mysql-connector-java.jar': 17 | ensure => 'link', 18 | target => '/usr/share/java/mysql-connector-java.jar', 19 | require => [Package['sqoop'], Package['libmysql-java']], 20 | } 21 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/templates/hadoop/core-site.xml.erb: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | <%= @use_yarn ? 'fs.defaultFS' : 'fs.default.name' %> 12 | hdfs://<%= @ha_enabled ? @nameservice_id : @primary_namenode_host %>/ 13 | 14 | 15 | <% if @io_file_buffer_size -%> 16 | 17 | io.file.buffer.size 18 | <%= io_file_buffer_size %> 19 | 20 | <% end -%> 21 | 22 | <% if enable_webhdfs -%> 23 | <% # NOTE: There might be a better way to 24 | # conditionally set this rather than relying 25 | # the enable_webhdfs setting. This will do for now. 26 | -%> 27 | 28 | 29 | hadoop.proxyuser.hue.hosts 30 | * 31 | 32 | 33 | hadoop.proxyuser.hue.groups 34 | * 35 | 36 | 37 | 38 | 39 | hadoop.proxyuser.oozie.hosts 40 | * 41 | 42 | 43 | hadoop.proxyuser.oozie.groups 44 | * 45 | 46 | <% end -%> 47 | 48 | <% if @net_topology_script_template -%> 49 | 50 | 51 | net.topology.script.file.name 52 | <%= @net_topology_script_path %> 53 | 54 | <% end -%> 55 | 56 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/templates/hadoop/hadoop-env.sh.erb: -------------------------------------------------------------------------------- 1 | # Note: This file is managed by Puppet. 2 | 3 | <% if use_yarn == true -%> 4 | # Use YARN for all hadoop commands 5 | export HADOOP_MAPRED_HOME=/usr/lib/hadoop-mapreduce 6 | <% else -%> 7 | export HADOOP_MAPRED_HOME=/usr/lib/hadoop-0.20-mapreduce 8 | <% end -%> 9 | 10 | <% if @namenode_jmxremote_port -%> 11 | # Enable NameNode JMX connections on port <%= namenode_jmxremote_port %> 12 | HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote.port=<%= namenode_jmxremote_port %> -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false" 13 | <% end -%> 14 | 15 | <% if @datanode_jmxremote_port -%> 16 | # Enable DateNode JMX connections on port <%= datanode_jmxremote_port %> 17 | HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote.port=<%= datanode_jmxremote_port %> -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false" 18 | <% end -%> 19 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/templates/hadoop/hadoop-metrics2.properties.erb: -------------------------------------------------------------------------------- 1 | # NOTE: This file is managed by Puppet. 
2 | 3 | # syntax: [prefix].[source|sink].[instance].[options] 4 | # See javadoc of package-info.java for org.apache.hadoop.metrics2 for details 5 | 6 | # default sampling period, in seconds 7 | *.period=10 8 | 9 | <% if @ganglia_hosts 10 | ganglia_hosts_string = ganglia_hosts.sort.join(',') 11 | -%> 12 | # 13 | # Below are for sending metrics to Ganglia 14 | # 15 | 16 | # for Ganglia 3.1 support 17 | *.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31 18 | 19 | *.sink.ganglia.period=10 20 | 21 | # default for supportsparse is false 22 | # *.sink.ganglia.supportsparse=true 23 | 24 | *.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both 25 | *.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40 26 | 27 | namenode.sink.ganglia.servers=<%= ganglia_hosts_string %> 28 | datanode.sink.ganglia.servers=<%= ganglia_hosts_string %> 29 | 30 | <% if use_yarn -%> 31 | resourcemanager.sink.ganglia.servers=<%= ganglia_hosts_string %> 32 | nodemanager.sink.ganglia.servers=<%= ganglia_hosts_string %> 33 | <% else -%> 34 | jobtracker.sink.ganglia.servers=<%= ganglia_hosts_string %> 35 | tasktracker.sink.ganglia.servers=<%= ganglia_hosts_string %> 36 | <% end -%> 37 | 38 | maptask.sink.ganglia.servers=<%= ganglia_hosts_string %> 39 | reducetask.sink.ganglia.servers=<%= ganglia_hosts_string %> 40 | 41 | secondarynamenode.sink.ganglia.servers=<%= ganglia_hosts_string %> 42 | 43 | <% end -%> -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/templates/hadoop/httpfs-site.xml.erb: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | <% if enable_webhdfs -%> 10 | <% # NOTE: There might be a better way to 11 | # conditionally set this rather than relying 12 | # the enable_webhdfs setting. This will do for now. 13 | -%> 14 | 15 | 16 | httpfs.proxyuser.hue.hosts 17 | * 18 | 19 | 20 | httpfs.proxyuser.hue.groups 21 | * 22 | 23 | 24 | 25 | 26 | httpfs.proxyuser.oozie.hosts 27 | * 28 | 29 | 30 | httpfs.proxyuser.oozie.groups 31 | * 32 | 33 | <% end -%> 34 | 35 | 36 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/templates/hive/hive-exec-log4j.properties.erb: -------------------------------------------------------------------------------- 1 | hive.log.threshold=INFO 2 | hive.root.logger=INFO,RFA 3 | hive.log.dir=/var/log/hive 4 | hive.log.file=${hive.query.id}.log 5 | 6 | # Define the root logger to the system property "hive.root.logger". 7 | log4j.rootLogger=${hive.root.logger}, EventCounter 8 | 9 | # Logging Threshold 10 | log4j.threshhold=${hive.log.threshold} 11 | 12 | # 13 | # Rolling File Appender - cap space usage at 512MB 14 | # 15 | hive.log.maxfilesize=256MB 16 | hive.log.maxbackupindex=2 17 | log4j.appender.RFA=org.apache.log4j.RollingFileAppender 18 | log4j.appender.RFA.File=${hive.log.dir}/${hive.log.file} 19 | log4j.appender.RFA.MaxFileSize=${hive.log.maxfilesize} 20 | log4j.appender.RFA.MaxBackupIndex=${hive.log.maxbackupindex} 21 | log4j.appender.RFA.layout=org.apache.log4j.PatternLayout 22 | # Pattern format: Date LogLevel LoggerName LogMessage 23 | log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n 24 | 25 | # 26 | # Event Counter Appender 27 | # Sends counts of logging messages at different severity levels to Hadoop Metrics. 
28 | # 29 | log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter 30 | 31 | log4j.category.DataNucleus=ERROR,RFA 32 | log4j.category.Datastore=ERROR,RFA 33 | log4j.category.Datastore.Schema=ERROR,RFA 34 | log4j.category.JPOX.Datastore=ERROR,RFA 35 | log4j.category.JPOX.Plugin=ERROR,RFA 36 | log4j.category.JPOX.MetaData=ERROR,RFA 37 | log4j.category.JPOX.Query=ERROR,RFA 38 | log4j.category.JPOX.General=ERROR,RFA 39 | log4j.category.JPOX.Enhancer=ERROR,RFA 40 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/templates/oozie/oozie-env.sh.erb: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Note: This file is managed by Puppet. 4 | 5 | export OOZIE_CONFIG=/etc/oozie/conf 6 | export OOZIE_DATA=/var/lib/oozie 7 | export OOZIE_LOG=/var/log/oozie 8 | export OOZIE_CATALINA_HOME=/usr/lib/bigtop-tomcat 9 | export CATALINA_TMPDIR=/var/lib/oozie 10 | export CATALINA_PID=/var/run/oozie/oozie.pid 11 | export CATALINA_BASE=<%= @catalina_base %> 12 | export CATALINA_OPTS=-Xmx1024m 13 | <% 14 | # This puppet module doesn't (yet) support HTTPS configuration. 15 | # These are the defaults that ship with CDH4. 16 | -%> 17 | export OOZIE_HTTPS_PORT=11443 18 | export OOZIE_HTTPS_KEYSTORE_PASS=password 19 | export CATALINA_OPTS="$CATALINA_OPTS -Doozie.https.port=${OOZIE_HTTPS_PORT}" 20 | export CATALINA_OPTS="$CATALINA_OPTS -Doozie.https.keystore.pass=${OOZIE_HTTPS_KEYSTORE_PASS}" 21 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/templates/pig/pig.properties.erb: -------------------------------------------------------------------------------- 1 | # Pig configuration file. All values can be overwritten by command line arguments. 2 | # see bin/pig -help 3 | 4 | # log4jconf log4j configuration file 5 | # log4jconf=./conf/log4j.properties 6 | 7 | # brief logging (no timestamps) 8 | brief=false 9 | 10 | # clustername, name of the hadoop jobtracker. If no port is defined port 50020 will be used. 11 | #cluster 12 | 13 | #debug level, INFO is default 14 | debug=INFO 15 | 16 | # a file that contains pig script 17 | #file= 18 | 19 | # load jarfile, colon separated 20 | #jar= 21 | 22 | #verbose print all log messages to screen (default to print only INFO and above to screen) 23 | verbose=false 24 | 25 | #exectype local|mapreduce, mapreduce is default 26 | #exectype=mapreduce 27 | # hod realted properties 28 | #ssh.gateway 29 | #hod.expect.root 30 | #hod.expect.uselatest 31 | #hod.command 32 | #hod.config.dir 33 | #hod.param 34 | 35 | 36 | #Do not spill temp files smaller than this size (bytes) 37 | pig.spill.size.threshold=5000000 38 | #EXPERIMENT: Activate garbage collection when spilling a file bigger than this size (bytes) 39 | #This should help reduce the number of files being spilled. 40 | pig.spill.gc.activation.size=40000000 41 | 42 | 43 | ###################### 44 | # Everything below this line is Yahoo specific. Note that I've made 45 | # (almost) no changes to the lines above to make merging in from Apache 46 | # easier. Any values I don't want from above I override below. 47 | # 48 | # This file is configured for use with HOD on the production clusters. If you 49 | # want to run pig with a static cluster you will need to remove everything 50 | # below this line and set the cluster value (above) to the 51 | # hostname and port of your job tracker. 
52 | 53 | exectype=mapreduce 54 | log.file= 55 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/Makefile: -------------------------------------------------------------------------------- 1 | MANIFESTS=$(wildcard *.pp) 2 | OBJS=$(MANIFESTS:.pp=.po) 3 | TESTS_DIR=$(dir $(CURDIR)) 4 | MODULE_DIR=$(TESTS_DIR:/=) 5 | MODULES_DIR=$(dir $(MODULE_DIR)) 6 | 7 | all: test 8 | 9 | test: $(OBJS) 10 | 11 | %.po: %.pp 12 | puppet apply --noop --modulepath $(MODULES_DIR) $< -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/datanode.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | } 7 | 8 | include cdh4::hadoop::datanode 9 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/defaults.pp: -------------------------------------------------------------------------------- 1 | # 2 | include cdh4::hadoop::defaults 3 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/hadoop.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | } 7 | 8 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/historyserver.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | } 7 | 8 | # historyserver requires namenode 9 | include cdh4::hadoop::master 10 | include cdh4::hadoop::historyserver 11 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/hive.pp: -------------------------------------------------------------------------------- 1 | $fqdn = 'hive1.domain.org' 2 | class { 'cdh4::hive': 3 | metastore_host => $fqdn, 4 | zookeeper_hosts => ['zk1.domain.org', 'zk2.domain.org'], 5 | jdbc_password => 'test', 6 | } -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/hive_master.pp: -------------------------------------------------------------------------------- 1 | $fqdn = 'hive1.domain.org' 2 | class { '::cdh4::hadoop': 3 | namenode_hosts => ['localhost'], 4 | dfs_name_dir => '/var/lib/hadoop/name', 5 | } 6 | 7 | class { 'cdh4::hive': 8 | metastore_host => $fqdn, 9 | zookeeper_hosts => ['zk1.domain.org', 'zk2.domain.org'], 10 | jdbc_password => 'test', 11 | } 12 | class { 'cdh4::hive::master': } 13 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/hive_metastore.pp: -------------------------------------------------------------------------------- 1 | $fqdn = 'hive1.domain.org' 2 | class { 'cdh4::hive': 3 | metastore_host => $fqdn, 4 | zookeeper_hosts => ['zk1.domain.org', 'zk2.domain.org'], 5 | jdbc_password => 'test', 6 | } 7 | class { 'cdh4::hive::metastore': } 8 | 9 | -------------------------------------------------------------------------------- 
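The `tests/*.pp` manifests above are compile-only smoke tests: the `tests/Makefile` runs each one through `puppet apply --noop`, with `--modulepath` pointing at the directory that contains the `cdh4` module. A minimal sketch of exercising them by hand (the `/etc/puppet/modules` checkout path is an assumption; substitute wherever the modules are actually installed):

```bash
# Compile a single smoke test without applying any changes
# (assumes the cdh4 module is checked out under /etc/puppet/modules).
puppet apply --noop \
  --modulepath /etc/puppet/modules \
  /etc/puppet/modules/cdh4/tests/hive_metastore.pp

# Or run the whole suite via the module's Makefile.
make -C /etc/puppet/modules/cdh4/tests
```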
/initial-deployment-puppet/modules/cdh4/tests/hive_metastore_mysql.pp: -------------------------------------------------------------------------------- 1 | $fqdn = 'hive1.domain.org' 2 | class { 'cdh4::hive': 3 | metastore_host => $fqdn, 4 | zookeeper_hosts => ['zk1.domain.org', 'zk2.domain.org'], 5 | jdbc_password => 'test', 6 | } 7 | class { 'cdh4::hive::metastore::mysql': } 8 | 9 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/hive_server.pp: -------------------------------------------------------------------------------- 1 | $fqdn = 'hive1.domain.org' 2 | class { '::cdh4::hadoop': 3 | namenode_hosts => ['localhost'], 4 | dfs_name_dir => '/var/lib/hadoop/name', 5 | } 6 | 7 | class { 'cdh4::hive': 8 | metastore_host => $fqdn, 9 | zookeeper_hosts => ['zk1.domain.org', 'zk2.domain.org'], 10 | jdbc_password => 'test', 11 | } 12 | class { 'cdh4::hive::server': } 13 | 14 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/jobtracker.pp: -------------------------------------------------------------------------------- 1 | 2 | class { '::cdh4::hadoop': 3 | use_yarn => false, 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | } 7 | 8 | # jobtracker requires namenode 9 | include cdh4::hadoop::master 10 | include cdh4::hadoop::jobtracker 11 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/master.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | } 7 | 8 | include cdh4::hadoop::master 9 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/namenode.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | } 7 | 8 | include cdh4::hadoop::namenode 9 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/namenode_primary.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost', 'nonya'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | nameservice_id => 'test-cdh4', 7 | journalnode_hosts => ['localhost', 'nonya'], 8 | } 9 | 10 | include cdh4::hadoop::namenode::primary 11 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/namenode_standby.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost', 'nonya'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | nameservice_id => 'test-cdh4', 7 | journalnode_hosts => ['localhost', 'nonya'], 8 | } 9 | 10 | include cdh4::hadoop::namenode::standby 11 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/nodemanager.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => 
'/var/lib/hadoop/name', 6 | } 7 | 8 | include cdh4::hadoop::nodemanager 9 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/pig.pp: -------------------------------------------------------------------------------- 1 | include cdh4::pig -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/resourcemanager.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => ['localhost'], 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | } 7 | 8 | # resourcemanager requires namenode 9 | include cdh4::hadoop::master 10 | include cdh4::hadoop::resourcemanager 11 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/sqoop.pp: -------------------------------------------------------------------------------- 1 | include cdh4::sqoop -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/tasktracker.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | use_yarn => false, 5 | namenode_hosts => ['localhost'], 6 | dfs_name_dir => '/var/lib/hadoop/name', 7 | } 8 | 9 | include cdh4::hadoop::tasktracker 10 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/cdh4/tests/worker.pp: -------------------------------------------------------------------------------- 1 | # 2 | 3 | class { '::cdh4::hadoop': 4 | namenode_hosts => 'localhost', 5 | dfs_name_dir => '/var/lib/hadoop/name', 6 | datanode_mounts => '/tmp', 7 | } 8 | 9 | include cdh4::hadoop::worker 10 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Stefan van Wouw 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/README.md: -------------------------------------------------------------------------------- 1 | # Puppet module for Spark (0.9.0) 2 | 3 | Puppet module to install Spark (0.9.0) on your Hadoop cluster. 4 | 5 | 6 | Unfortunately no Debian packages are available for Spark, and the pre-compiled Spark versions are not compatible with CDH 4.4.0. 7 | Therefore I built the Spark incubator version 0.9.0 and included the entire dist directory in the puppet module. 8 | 9 | If you want to deploy another version of Spark use the following code to compile (e.g. older Spark 0.8.0): 10 | 11 | 12 | ```bash 13 | wget https://github.com/apache/incubator-spark/archive/v0.8.0-incubating.tar.gz 14 | tar xvf v0.8.0-incubating.tar.gz 15 | cd incubator-spark-0.8.0-incubating/ 16 | ./make-distribution.sh --hadoop 2.0.0-cdh4.4.0 17 | cp conf/log4j.properties.template dist/conf/log4j.properties 18 | 19 | # Replace the standard distribution with the one you just compiled: 20 | rm -rf /etc/puppet/modules/spark/files/spark 21 | cp -r dist /etc/puppet/modules/spark/files/spark 22 | 23 | ``` 24 | 25 | *Note: Spark 0.8.0 does not compile with YARN enabled against YARN CDH4.4.0.* 26 | 27 | 28 | ### Dependencies not made explicit in the module itself: 29 | 30 | 31 | - Oracle Java 6 (7 for Spark 0.9.0+) installed on all nodes (requirement of Spark). 32 | - Apache HDFS should be installed (The CDH4 versions included in: https://github.com/wikimedia/puppet-cdh4 ). 33 | - OS should be Ubuntu/Debian for package dependencies. 34 | 35 | ### Usage: 36 | 37 | 38 | On the master node: 39 | ```puppet 40 | class {'spark::master': 41 | worker_mem => 'worker memory e.g. 60g', 42 | require => [ 43 | Class['your::class::that::ensures::java::is::installed'], 44 | Class['cdh4::hadoop'] 45 | ], 46 | } 47 | ``` 48 | 49 | On the worker nodes: 50 | ```puppet 51 | class {'spark::worker': 52 | master => $master_fqdn, 53 | memory => 'worker memory e.g. 60g', 54 | require => [ 55 | Class['your::class::that::ensures::java::is::installed'], 56 | Class['cdh4::hadoop'] 57 | ], 58 | } 59 | ``` 60 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/RELEASE: -------------------------------------------------------------------------------- 1 | Spark 1.0.0 built for Hadoop 2.0.0-cdh4.7.0 2 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/load-spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
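The spark module README above ships a Spark 0.9.0-incubating build made against CDH 4.4.0 and copies the resulting `dist/` directory into `files/spark`. A sketch of reproducing that build, mirroring the README's 0.8.0 example (the 0.9.0 archive name and the `--with-yarn` flag are assumptions based on the upstream incubator release layout; verify against the 0.9.0 `make-distribution.sh`):

```bash
# Build Spark 0.9.0-incubating with YARN support against CDH 4.4.0
# (per the README note, 0.8.0 does not compile with YARN against CDH 4.4.0).
wget https://github.com/apache/incubator-spark/archive/v0.9.0-incubating.tar.gz
tar xvf v0.9.0-incubating.tar.gz
cd incubator-spark-0.9.0-incubating/
./make-distribution.sh --hadoop 2.0.0-cdh4.4.0 --with-yarn
cp conf/log4j.properties.template dist/conf/log4j.properties

# Replace the distribution shipped in the puppet module with the fresh build.
rm -rf /etc/puppet/modules/spark/files/spark
cp -r dist /etc/puppet/modules/spark/files/spark
```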
16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # This script loads spark-env.sh if it exists, and ensures it is only loaded once. 21 | # spark-env.sh is loaded from SPARK_CONF_DIR if set, or within the current directory's 22 | # conf/ subdirectory. 23 | 24 | if [ -z "$SPARK_ENV_LOADED" ]; then 25 | export SPARK_ENV_LOADED=1 26 | 27 | # Returns the parent of the directory this script lives in. 28 | parent_dir="$(cd `dirname $0`/..; pwd)" 29 | 30 | use_conf_dir=${SPARK_CONF_DIR:-"$parent_dir/conf"} 31 | 32 | if [ -f "${use_conf_dir}/spark-env.sh" ]; then 33 | # Promote all variable declarations to environment (exported) variables 34 | set -a 35 | . "${use_conf_dir}/spark-env.sh" 36 | set +a 37 | fi 38 | fi 39 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/pyspark.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem 4 | rem Licensed to the Apache Software Foundation (ASF) under one or more 5 | rem contributor license agreements. See the NOTICE file distributed with 6 | rem this work for additional information regarding copyright ownership. 7 | rem The ASF licenses this file to You under the Apache License, Version 2.0 8 | rem (the "License"); you may not use this file except in compliance with 9 | rem the License. You may obtain a copy of the License at 10 | rem 11 | rem http://www.apache.org/licenses/LICENSE-2.0 12 | rem 13 | rem Unless required by applicable law or agreed to in writing, software 14 | rem distributed under the License is distributed on an "AS IS" BASIS, 15 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | rem See the License for the specific language governing permissions and 17 | rem limitations under the License. 18 | rem 19 | 20 | rem This is the entry point for running PySpark. To avoid polluting the 21 | rem environment, it just launches a new cmd to do the real work. 22 | 23 | cmd /V /E /C %~dp0pyspark2.cmd %* 24 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/pyspark2.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem 4 | rem Licensed to the Apache Software Foundation (ASF) under one or more 5 | rem contributor license agreements. See the NOTICE file distributed with 6 | rem this work for additional information regarding copyright ownership. 7 | rem The ASF licenses this file to You under the Apache License, Version 2.0 8 | rem (the "License"); you may not use this file except in compliance with 9 | rem the License. You may obtain a copy of the License at 10 | rem 11 | rem http://www.apache.org/licenses/LICENSE-2.0 12 | rem 13 | rem Unless required by applicable law or agreed to in writing, software 14 | rem distributed under the License is distributed on an "AS IS" BASIS, 15 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | rem See the License for the specific language governing permissions and 17 | rem limitations under the License. 
18 | rem 19 | 20 | set SCALA_VERSION=2.10 21 | 22 | rem Figure out where the Spark framework is installed 23 | set FWDIR=%~dp0..\ 24 | 25 | rem Export this as SPARK_HOME 26 | set SPARK_HOME=%FWDIR% 27 | 28 | rem Test whether the user has built Spark 29 | if exist "%FWDIR%RELEASE" goto skip_build_test 30 | set FOUND_JAR=0 31 | for %%d in ("%FWDIR%assembly\target\scala-%SCALA_VERSION%\spark-assembly*hadoop*.jar") do ( 32 | set FOUND_JAR=1 33 | ) 34 | if [%FOUND_JAR%] == [0] ( 35 | echo Failed to find Spark assembly JAR. 36 | echo You need to build Spark with sbt\sbt assembly before running this program. 37 | goto exit 38 | ) 39 | :skip_build_test 40 | 41 | rem Load environment variables from conf\spark-env.cmd, if it exists 42 | if exist "%FWDIR%conf\spark-env.cmd" call "%FWDIR%conf\spark-env.cmd" 43 | 44 | rem Figure out which Python to use. 45 | if [%PYSPARK_PYTHON%] == [] set PYSPARK_PYTHON=python 46 | 47 | set PYTHONPATH=%FWDIR%python;%PYTHONPATH% 48 | set PYTHONPATH=%FWDIR%python\lib\py4j-0.8.1-src.zip;%PYTHONPATH% 49 | 50 | set OLD_PYTHONSTARTUP=%PYTHONSTARTUP% 51 | set PYTHONSTARTUP=%FWDIR%python\pyspark\shell.py 52 | set PYSPARK_SUBMIT_ARGS=%* 53 | 54 | echo Running %PYSPARK_PYTHON% with PYTHONPATH=%PYTHONPATH% 55 | 56 | rem Check whether the argument is a file 57 | for /f %%i in ('echo %1^| findstr /R "\.py"') do ( 58 | set PYTHON_FILE=%%i 59 | ) 60 | 61 | if [%PYTHON_FILE%] == [] ( 62 | %PYSPARK_PYTHON% 63 | ) else ( 64 | echo. 65 | echo WARNING: Running python applications through ./bin/pyspark.cmd is deprecated as of Spark 1.0. 66 | echo Use ./bin/spark-submit ^ 67 | echo. 68 | "%FWDIR%\bin\spark-submit.cmd" %PYSPARK_SUBMIT_ARGS% 69 | ) 70 | 71 | :exit 72 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/run-example: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | SCALA_VERSION=2.10 21 | 22 | FWDIR="$(cd `dirname $0`/..; pwd)" 23 | export SPARK_HOME="$FWDIR" 24 | EXAMPLES_DIR="$FWDIR"/examples 25 | 26 | if [ -n "$1" ]; then 27 | EXAMPLE_CLASS="$1" 28 | shift 29 | else 30 | echo "Usage: ./bin/run-example [example-args]" 31 | echo " - set MASTER=XX to use a specific master" 32 | echo " - can use abbreviated example class name (e.g. 
SparkPi, mllib.LinearRegression)" 33 | exit 1 34 | fi 35 | 36 | if [ -f "$FWDIR/RELEASE" ]; then 37 | export SPARK_EXAMPLES_JAR=`ls "$FWDIR"/lib/spark-examples-*hadoop*.jar` 38 | elif [ -e "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/spark-examples-*hadoop*.jar ]; then 39 | export SPARK_EXAMPLES_JAR=`ls "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/spark-examples-*hadoop*.jar` 40 | fi 41 | 42 | if [[ -z $SPARK_EXAMPLES_JAR ]]; then 43 | echo "Failed to find Spark examples assembly in $FWDIR/lib or $FWDIR/examples/target" >&2 44 | echo "You need to build Spark before running this program" >&2 45 | exit 1 46 | fi 47 | 48 | EXAMPLE_MASTER=${MASTER:-"local[*]"} 49 | 50 | if [[ ! $EXAMPLE_CLASS == org.apache.spark.examples* ]]; then 51 | EXAMPLE_CLASS="org.apache.spark.examples.$EXAMPLE_CLASS" 52 | fi 53 | 54 | ./bin/spark-submit \ 55 | --master $EXAMPLE_MASTER \ 56 | --class $EXAMPLE_CLASS \ 57 | "$SPARK_EXAMPLES_JAR" \ 58 | "$@" 59 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/run-example.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem 4 | rem Licensed to the Apache Software Foundation (ASF) under one or more 5 | rem contributor license agreements. See the NOTICE file distributed with 6 | rem this work for additional information regarding copyright ownership. 7 | rem The ASF licenses this file to You under the Apache License, Version 2.0 8 | rem (the "License"); you may not use this file except in compliance with 9 | rem the License. You may obtain a copy of the License at 10 | rem 11 | rem http://www.apache.org/licenses/LICENSE-2.0 12 | rem 13 | rem Unless required by applicable law or agreed to in writing, software 14 | rem distributed under the License is distributed on an "AS IS" BASIS, 15 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | rem See the License for the specific language governing permissions and 17 | rem limitations under the License. 18 | rem 19 | 20 | rem This is the entry point for running a Spark example. To avoid polluting 21 | rem the environment, it just launches a new cmd to do the real work. 22 | 23 | cmd /V /E /C %~dp0run-example2.cmd %* 24 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/spark-class.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem 4 | rem Licensed to the Apache Software Foundation (ASF) under one or more 5 | rem contributor license agreements. See the NOTICE file distributed with 6 | rem this work for additional information regarding copyright ownership. 7 | rem The ASF licenses this file to You under the Apache License, Version 2.0 8 | rem (the "License"); you may not use this file except in compliance with 9 | rem the License. You may obtain a copy of the License at 10 | rem 11 | rem http://www.apache.org/licenses/LICENSE-2.0 12 | rem 13 | rem Unless required by applicable law or agreed to in writing, software 14 | rem distributed under the License is distributed on an "AS IS" BASIS, 15 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | rem See the License for the specific language governing permissions and 17 | rem limitations under the License. 18 | rem 19 | 20 | rem This is the entry point for running a Spark class. 
To avoid polluting 21 | rem the environment, it just launches a new cmd to do the real work. 22 | 23 | cmd /V /E /C %~dp0spark-class2.cmd %* 24 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/spark-shell.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem 4 | rem Licensed to the Apache Software Foundation (ASF) under one or more 5 | rem contributor license agreements. See the NOTICE file distributed with 6 | rem this work for additional information regarding copyright ownership. 7 | rem The ASF licenses this file to You under the Apache License, Version 2.0 8 | rem (the "License"); you may not use this file except in compliance with 9 | rem the License. You may obtain a copy of the License at 10 | rem 11 | rem http://www.apache.org/licenses/LICENSE-2.0 12 | rem 13 | rem Unless required by applicable law or agreed to in writing, software 14 | rem distributed under the License is distributed on an "AS IS" BASIS, 15 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | rem See the License for the specific language governing permissions and 17 | rem limitations under the License. 18 | rem 19 | 20 | set SPARK_HOME=%~dp0.. 21 | 22 | cmd /V /E /C %SPARK_HOME%\bin\spark-submit.cmd spark-shell %* --class org.apache.spark.repl.Main 23 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/spark-submit: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | export SPARK_HOME="$(cd `dirname $0`/..; pwd)" 21 | ORIG_ARGS=("$@") 22 | 23 | while (($#)); do 24 | if [ "$1" = "--deploy-mode" ]; then 25 | DEPLOY_MODE=$2 26 | elif [ "$1" = "--driver-memory" ]; then 27 | DRIVER_MEMORY=$2 28 | elif [ "$1" = "--driver-library-path" ]; then 29 | export SPARK_SUBMIT_LIBRARY_PATH=$2 30 | elif [ "$1" = "--driver-class-path" ]; then 31 | export SPARK_SUBMIT_CLASSPATH=$2 32 | elif [ "$1" = "--driver-java-options" ]; then 33 | export SPARK_SUBMIT_OPTS=$2 34 | fi 35 | shift 36 | done 37 | 38 | DEPLOY_MODE=${DEPLOY_MODE:-"client"} 39 | 40 | if [ -n "$DRIVER_MEMORY" ] && [ $DEPLOY_MODE == "client" ]; then 41 | export SPARK_DRIVER_MEMORY=$DRIVER_MEMORY 42 | fi 43 | 44 | exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.SparkSubmit "${ORIG_ARGS[@]}" 45 | 46 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/bin/spark-submit.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem 4 | rem Licensed to the Apache Software Foundation (ASF) under one or more 5 | rem contributor license agreements. See the NOTICE file distributed with 6 | rem this work for additional information regarding copyright ownership. 7 | rem The ASF licenses this file to You under the Apache License, Version 2.0 8 | rem (the "License"); you may not use this file except in compliance with 9 | rem the License. You may obtain a copy of the License at 10 | rem 11 | rem http://www.apache.org/licenses/LICENSE-2.0 12 | rem 13 | rem Unless required by applicable law or agreed to in writing, software 14 | rem distributed under the License is distributed on an "AS IS" BASIS, 15 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | rem See the License for the specific language governing permissions and 17 | rem limitations under the License. 18 | rem 19 | 20 | set SPARK_HOME=%~dp0.. 
21 | set ORIG_ARGS=%* 22 | 23 | rem Clear the values of all variables used 24 | set DEPLOY_MODE= 25 | set DRIVER_MEMORY= 26 | set SPARK_SUBMIT_LIBRARY_PATH= 27 | set SPARK_SUBMIT_CLASSPATH= 28 | set SPARK_SUBMIT_OPTS= 29 | set SPARK_DRIVER_MEMORY= 30 | 31 | :loop 32 | if [%1] == [] goto continue 33 | if [%1] == [--deploy-mode] ( 34 | set DEPLOY_MODE=%2 35 | ) else if [%1] == [--driver-memory] ( 36 | set DRIVER_MEMORY=%2 37 | ) else if [%1] == [--driver-library-path] ( 38 | set SPARK_SUBMIT_LIBRARY_PATH=%2 39 | ) else if [%1] == [--driver-class-path] ( 40 | set SPARK_SUBMIT_CLASSPATH=%2 41 | ) else if [%1] == [--driver-java-options] ( 42 | set SPARK_SUBMIT_OPTS=%2 43 | ) 44 | shift 45 | goto loop 46 | :continue 47 | 48 | if [%DEPLOY_MODE%] == [] ( 49 | set DEPLOY_MODE=client 50 | ) 51 | 52 | if not [%DRIVER_MEMORY%] == [] if [%DEPLOY_MODE%] == [client] ( 53 | set SPARK_DRIVER_MEMORY=%DRIVER_MEMORY% 54 | ) 55 | 56 | cmd /V /E /C %SPARK_HOME%\bin\spark-class.cmd org.apache.spark.deploy.SparkSubmit %ORIG_ARGS% 57 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/conf/fairscheduler.xml.template: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | FAIR 5 | 1 6 | 2 7 | 8 | 9 | FIFO 10 | 2 11 | 3 12 | 13 | 14 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/conf/log4j.properties.template: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 11 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 12 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/conf/slaves: -------------------------------------------------------------------------------- 1 | # A Spark Worker will be started on each of the machines listed below. 2 | localhost -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/conf/spark-defaults.conf.template: -------------------------------------------------------------------------------- 1 | # Default system properties included when running spark-submit. 2 | # This is useful for setting default environmental settings. 3 | 4 | # Example: 5 | # spark.master spark://master:7077 6 | # spark.eventLog.enabled true 7 | # spark.eventLog.dir hdfs://namenode:8021/directory 8 | # spark.serializer org.apache.spark.serializer.KryoSerializer 9 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/ec2/README: -------------------------------------------------------------------------------- 1 | This folder contains a script, spark-ec2, for launching Spark clusters on 2 | Amazon EC2. 
Usage instructions are available online at: 3 | 4 | http://spark.apache.org/docs/latest/ec2-scripts.html 5 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # These variables are automatically filled in by the spark-ec2 script. 21 | export MASTERS="{{master_list}}" 22 | export SLAVES="{{slave_list}}" 23 | export HDFS_DATA_DIRS="{{hdfs_data_dirs}}" 24 | export MAPRED_LOCAL_DIRS="{{mapred_local_dirs}}" 25 | export SPARK_LOCAL_DIRS="{{spark_local_dirs}}" 26 | export MODULES="{{modules}}" 27 | export SPARK_VERSION="{{spark_version}}" 28 | export SHARK_VERSION="{{shark_version}}" 29 | export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}" 30 | export SWAP_MB="{{swap}}" 31 | export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}" 32 | export SPARK_MASTER_OPTS="{{spark_master_opts}}" 33 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/ec2/spark-ec2: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one 5 | # or more contributor license agreements. See the NOTICE file 6 | # distributed with this work for additional information 7 | # regarding copyright ownership. The ASF licenses this file 8 | # to you under the Apache License, Version 2.0 (the 9 | # "License"); you may not use this file except in compliance 10 | # with the License. You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 
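The `ec2/` files above are the stock upstream launcher: the `spark-ec2` wrapper simply puts the bundled boto on `PYTHONPATH` and delegates to `spark_ec2.py`. A typical invocation looks roughly like the following (the key pair, identity file, slave count, and cluster name are placeholders, and the exact flags should be checked against the ec2-scripts documentation linked in the README above):

```bash
# Launch a small cluster, log in to its master, and tear it down again.
./spark-ec2 -k my-keypair -i ~/.ssh/my-keypair.pem -s 2 launch test-cluster
./spark-ec2 -k my-keypair -i ~/.ssh/my-keypair.pem login test-cluster
./spark-ec2 destroy test-cluster
```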
19 | # 20 | 21 | cd "`dirname $0`" 22 | PYTHONPATH="./third_party/boto-2.4.1.zip/boto-2.4.1:$PYTHONPATH" python ./spark_ec2.py $@ 23 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/ec2/third_party/boto-2.4.1.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/initial-deployment-puppet/modules/spark/files/spark/ec2/third_party/boto-2.4.1.zip -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/java/org/apache/spark/examples/JavaSparkPi.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples; 19 | 20 | import org.apache.spark.SparkConf; 21 | import org.apache.spark.api.java.JavaRDD; 22 | import org.apache.spark.api.java.JavaSparkContext; 23 | import org.apache.spark.api.java.function.Function; 24 | import org.apache.spark.api.java.function.Function2; 25 | 26 | import java.util.ArrayList; 27 | import java.util.List; 28 | 29 | /** 30 | * Computes an approximation to pi 31 | * Usage: JavaSparkPi [slices] 32 | */ 33 | public final class JavaSparkPi { 34 | 35 | 36 | public static void main(String[] args) throws Exception { 37 | SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi"); 38 | JavaSparkContext jsc = new JavaSparkContext(sparkConf); 39 | 40 | int slices = (args.length == 1) ? Integer.parseInt(args[0]) : 2; 41 | int n = 100000 * slices; 42 | List l = new ArrayList(n); 43 | for (int i = 0; i < n; i++) { 44 | l.add(i); 45 | } 46 | 47 | JavaRDD dataSet = jsc.parallelize(l, slices); 48 | 49 | int count = dataSet.map(new Function() { 50 | @Override 51 | public Integer call(Integer integer) { 52 | double x = Math.random() * 2 - 1; 53 | double y = Math.random() * 2 - 1; 54 | return (x * x + y * y < 1) ? 1 : 0; 55 | } 56 | }).reduce(new Function2() { 57 | @Override 58 | public Integer call(Integer integer, Integer integer2) { 59 | return integer + integer2; 60 | } 61 | }); 62 | 63 | System.out.println("Pi is roughly " + 4.0 * count / n); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/kmeans.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. 
See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | The K-means algorithm written from scratch against PySpark. In practice, 20 | one may prefer to use the KMeans algorithm in MLlib, as shown in 21 | examples/src/main/python/mllib/kmeans.py. 22 | 23 | This example requires NumPy (http://www.numpy.org/). 24 | """ 25 | 26 | import sys 27 | 28 | import numpy as np 29 | from pyspark import SparkContext 30 | 31 | 32 | def parseVector(line): 33 | return np.array([float(x) for x in line.split(' ')]) 34 | 35 | 36 | def closestPoint(p, centers): 37 | bestIndex = 0 38 | closest = float("+inf") 39 | for i in range(len(centers)): 40 | tempDist = np.sum((p - centers[i]) ** 2) 41 | if tempDist < closest: 42 | closest = tempDist 43 | bestIndex = i 44 | return bestIndex 45 | 46 | 47 | if __name__ == "__main__": 48 | if len(sys.argv) != 4: 49 | print >> sys.stderr, "Usage: kmeans " 50 | exit(-1) 51 | sc = SparkContext(appName="PythonKMeans") 52 | lines = sc.textFile(sys.argv[1]) 53 | data = lines.map(parseVector).cache() 54 | K = int(sys.argv[2]) 55 | convergeDist = float(sys.argv[3]) 56 | 57 | kPoints = data.takeSample(False, K, 1) 58 | tempDist = 1.0 59 | 60 | while tempDist > convergeDist: 61 | closest = data.map( 62 | lambda p: (closestPoint(p, kPoints), (p, 1))) 63 | pointStats = closest.reduceByKey( 64 | lambda (x1, y1), (x2, y2): (x1 + x2, y1 + y2)) 65 | newPoints = pointStats.map( 66 | lambda (x, (y, z)): (x, y / z)).collect() 67 | 68 | tempDist = sum(np.sum((kPoints[x] - y) ** 2) for (x, y) in newPoints) 69 | 70 | for (x, y) in newPoints: 71 | kPoints[x] = y 72 | 73 | print "Final centers: " + str(kPoints) 74 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/mllib/kmeans.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | A K-means clustering program using MLlib. 
20 | 21 | This example requires NumPy (http://www.numpy.org/). 22 | """ 23 | 24 | import sys 25 | 26 | import numpy as np 27 | from pyspark import SparkContext 28 | from pyspark.mllib.clustering import KMeans 29 | 30 | 31 | def parseVector(line): 32 | return np.array([float(x) for x in line.split(' ')]) 33 | 34 | 35 | if __name__ == "__main__": 36 | if len(sys.argv) != 3: 37 | print >> sys.stderr, "Usage: kmeans " 38 | exit(-1) 39 | sc = SparkContext(appName="KMeans") 40 | lines = sc.textFile(sys.argv[1]) 41 | data = lines.map(parseVector) 42 | k = int(sys.argv[2]) 43 | model = KMeans.train(data, k) 44 | print "Final centers: " + str(model.clusterCenters) 45 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/mllib/logistic_regression.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | Logistic regression using MLlib. 20 | 21 | This example requires NumPy (http://www.numpy.org/). 22 | """ 23 | 24 | from math import exp 25 | import sys 26 | 27 | import numpy as np 28 | from pyspark import SparkContext 29 | from pyspark.mllib.regression import LabeledPoint 30 | from pyspark.mllib.classification import LogisticRegressionWithSGD 31 | 32 | 33 | # Parse a line of text into an MLlib LabeledPoint object 34 | def parsePoint(line): 35 | values = [float(s) for s in line.split(' ')] 36 | if values[0] == -1: # Convert -1 labels to 0 for MLlib 37 | values[0] = 0 38 | return LabeledPoint(values[0], values[1:]) 39 | 40 | 41 | if __name__ == "__main__": 42 | if len(sys.argv) != 3: 43 | print >> sys.stderr, "Usage: logistic_regression " 44 | exit(-1) 45 | sc = SparkContext(appName="PythonLR") 46 | points = sc.textFile(sys.argv[1]).map(parsePoint) 47 | iterations = int(sys.argv[2]) 48 | model = LogisticRegressionWithSGD.train(points, iterations) 49 | print "Final weights: " + str(model.weights) 50 | print "Final intercept: " + str(model.intercept) 51 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/pagerank.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import re 19 | import sys 20 | from operator import add 21 | 22 | from pyspark import SparkContext 23 | 24 | 25 | def computeContribs(urls, rank): 26 | """Calculates URL contributions to the rank of other URLs.""" 27 | num_urls = len(urls) 28 | for url in urls: 29 | yield (url, rank / num_urls) 30 | 31 | 32 | def parseNeighbors(urls): 33 | """Parses a urls pair string into urls pair.""" 34 | parts = re.split(r'\s+', urls) 35 | return parts[0], parts[1] 36 | 37 | 38 | if __name__ == "__main__": 39 | if len(sys.argv) != 3: 40 | print >> sys.stderr, "Usage: pagerank " 41 | exit(-1) 42 | 43 | # Initialize the spark context. 44 | sc = SparkContext(appName="PythonPageRank") 45 | 46 | # Loads in input file. It should be in format of: 47 | # URL neighbor URL 48 | # URL neighbor URL 49 | # URL neighbor URL 50 | # ... 51 | lines = sc.textFile(sys.argv[1], 1) 52 | 53 | # Loads all URLs from input file and initialize their neighbors. 54 | links = lines.map(lambda urls: parseNeighbors(urls)).distinct().groupByKey().cache() 55 | 56 | # Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one. 57 | ranks = links.map(lambda (url, neighbors): (url, 1.0)) 58 | 59 | # Calculates and updates URL ranks continuously using PageRank algorithm. 60 | for iteration in xrange(int(sys.argv[2])): 61 | # Calculates URL contributions to the rank of other URLs. 62 | contribs = links.join(ranks).flatMap( 63 | lambda (url, (urls, rank)): computeContribs(urls, rank)) 64 | 65 | # Re-calculates URL ranks based on neighbor contributions. 66 | ranks = contribs.reduceByKey(add).mapValues(lambda rank: rank * 0.85 + 0.15) 67 | 68 | # Collects all URL ranks and dump them to console. 69 | for (link, rank) in ranks.collect(): 70 | print "%s has rank: %s." % (link, rank) 71 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/pi.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | import sys 19 | from random import random 20 | from operator import add 21 | 22 | from pyspark import SparkContext 23 | 24 | 25 | if __name__ == "__main__": 26 | """ 27 | Usage: pi [slices] 28 | """ 29 | sc = SparkContext(appName="PythonPi") 30 | slices = int(sys.argv[1]) if len(sys.argv) > 1 else 2 31 | n = 100000 * slices 32 | 33 | def f(_): 34 | x = random() * 2 - 1 35 | y = random() * 2 - 1 36 | return 1 if x ** 2 + y ** 2 < 1 else 0 37 | 38 | count = sc.parallelize(xrange(1, n+1), slices).map(f).reduce(add) 39 | print "Pi is roughly %f" % (4.0 * count / n) 40 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/sort.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import sys 19 | 20 | from pyspark import SparkContext 21 | 22 | 23 | if __name__ == "__main__": 24 | if len(sys.argv) != 2: 25 | print >> sys.stderr, "Usage: sort " 26 | exit(-1) 27 | sc = SparkContext(appName="PythonSort") 28 | lines = sc.textFile(sys.argv[1], 1) 29 | sortedCount = lines.flatMap(lambda x: x.split(' ')) \ 30 | .map(lambda x: (int(x), 1)) \ 31 | .sortByKey(lambda x: x) 32 | # This is just a demo on how to bring all the sorted data back to a single node. 33 | # In reality, we wouldn't want to collect all the data to the driver node. 34 | output = sortedCount.collect() 35 | for (num, unitcount) in output: 36 | print num 37 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/transitive_closure.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | import sys 19 | from random import Random 20 | 21 | from pyspark import SparkContext 22 | 23 | numEdges = 200 24 | numVertices = 100 25 | rand = Random(42) 26 | 27 | 28 | def generateGraph(): 29 | edges = set() 30 | while len(edges) < numEdges: 31 | src = rand.randrange(0, numEdges) 32 | dst = rand.randrange(0, numEdges) 33 | if src != dst: 34 | edges.add((src, dst)) 35 | return edges 36 | 37 | 38 | if __name__ == "__main__": 39 | """ 40 | Usage: transitive_closure [slices] 41 | """ 42 | sc = SparkContext(appName="PythonTransitiveClosure") 43 | slices = int(sys.argv[1]) if len(sys.argv) > 1 else 2 44 | tc = sc.parallelize(generateGraph(), slices).cache() 45 | 46 | # Linear transitive closure: each round grows paths by one edge, 47 | # by joining the graph's edges with the already-discovered paths. 48 | # e.g. join the path (y, z) from the TC with the edge (x, y) from 49 | # the graph to obtain the path (x, z). 50 | 51 | # Because join() joins on keys, the edges are stored in reversed order. 52 | edges = tc.map(lambda (x, y): (y, x)) 53 | 54 | oldCount = 0L 55 | nextCount = tc.count() 56 | while True: 57 | oldCount = nextCount 58 | # Perform the join, obtaining an RDD of (y, (z, x)) pairs, 59 | # then project the result to obtain the new (x, z) paths. 60 | new_edges = tc.join(edges).map(lambda (_, (a, b)): (b, a)) 61 | tc = tc.union(new_edges).distinct().cache() 62 | nextCount = tc.count() 63 | if nextCount == oldCount: 64 | break 65 | 66 | print "TC has %i edges" % tc.count() 67 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/python/wordcount.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | import sys 19 | from operator import add 20 | 21 | from pyspark import SparkContext 22 | 23 | 24 | if __name__ == "__main__": 25 | if len(sys.argv) != 2: 26 | print >> sys.stderr, "Usage: wordcount " 27 | exit(-1) 28 | sc = SparkContext(appName="PythonWordCount") 29 | lines = sc.textFile(sys.argv[1], 1) 30 | counts = lines.flatMap(lambda x: x.split(' ')) \ 31 | .map(lambda x: (x, 1)) \ 32 | .reduceByKey(add) 33 | output = counts.collect() 34 | for (word, count) in output: 35 | print "%s: %i" % (word, count) 36 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/resources/people.txt: -------------------------------------------------------------------------------- 1 | Michael, 29 2 | Andy, 30 3 | Justin, 19 4 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | 22 | /** 23 | * Usage: BroadcastTest [slices] [numElem] [broadcastAlgo] [blockSize] 24 | */ 25 | object BroadcastTest { 26 | def main(args: Array[String]) { 27 | 28 | val bcName = if (args.length > 2) args(2) else "Http" 29 | val blockSize = if (args.length > 3) args(3) else "4096" 30 | 31 | System.setProperty("spark.broadcast.factory", "org.apache.spark.broadcast." + bcName + 32 | "BroadcastFactory") 33 | System.setProperty("spark.broadcast.blockSize", blockSize) 34 | val sparkConf = new SparkConf().setAppName("Broadcast Test") 35 | 36 | val sc = new SparkContext(sparkConf) 37 | 38 | val slices = if (args.length > 0) args(0).toInt else 2 39 | val num = if (args.length > 1) args(1).toInt else 1000000 40 | 41 | val arr1 = new Array[Int](num) 42 | for (i <- 0 until arr1.length) { 43 | arr1(i) = i 44 | } 45 | 46 | for (i <- 0 until 3) { 47 | println("Iteration " + i) 48 | println("===========") 49 | val startTime = System.nanoTime 50 | val barr1 = sc.broadcast(arr1) 51 | val observedSizes = sc.parallelize(1 to 10, slices).map(_ => barr1.value.size) 52 | // Collect the small RDD so we can print the observed sizes locally. 
53 | observedSizes.collect().foreach(i => println(i)) 54 | println("Iteration %d took %.0f milliseconds".format(i, (System.nanoTime - startTime) / 1E6)) 55 | } 56 | 57 | sc.stop() 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/DriverSubmissionTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.collection.JavaConversions._ 21 | 22 | /** Prints out environmental information, sleeps, and then exits. Made to 23 | * test driver submission in the standalone scheduler. */ 24 | object DriverSubmissionTest { 25 | def main(args: Array[String]) { 26 | if (args.size < 1) { 27 | println("Usage: DriverSubmissionTest ") 28 | System.exit(0) 29 | } 30 | val numSecondsToSleep = args(0).toInt 31 | 32 | val env = System.getenv() 33 | val properties = System.getProperties() 34 | 35 | println("Environment variables containing SPARK_TEST:") 36 | env.filter{case (k, v) => k.contains("SPARK_TEST")}.foreach(println) 37 | 38 | println("System properties containing spark.test:") 39 | properties.filter{case (k, v) => k.toString.contains("spark.test")}.foreach(println) 40 | 41 | for (i <- 1 until numSecondsToSleep) { 42 | println(s"Alive for $i out of $numSecondsToSleep seconds") 43 | Thread.sleep(1000) 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/ExceptionHandlingTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
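BroadcastTest above configures the broadcast implementation through System.setProperty before the SparkContext exists. The same spark.broadcast.factory and spark.broadcast.blockSize settings can instead be carried on the SparkConf itself, as in this sketch; the object name and local[2] master are illustrative, and the Http factory and 4096 block size are simply the example's own defaults.

import org.apache.spark.{SparkConf, SparkContext}

object BroadcastConfSketch {
  def main(args: Array[String]) {
    // Same spark.broadcast.* settings as BroadcastTest, but carried on the SparkConf
    // instead of being pushed through System.setProperty before the context is created.
    val conf = new SparkConf()
      .setAppName("Broadcast Conf Sketch")
      .setMaster("local[2]")
      .set("spark.broadcast.factory", "org.apache.spark.broadcast.HttpBroadcastFactory")
      .set("spark.broadcast.blockSize", "4096")
    val sc = new SparkContext(conf)
    val data = sc.broadcast(Array.fill(1000)(1))
    // Each task reads the broadcast value rather than shipping the array inside its closure.
    val sizes = sc.parallelize(1 to 10, 2).map(_ => data.value.length).collect()
    println(sizes.mkString(","))
    sc.stop()
  }
}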
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | 22 | object ExceptionHandlingTest { 23 | def main(args: Array[String]) { 24 | val sparkConf = new SparkConf().setAppName("ExceptionHandlingTest") 25 | val sc = new SparkContext(sparkConf) 26 | sc.parallelize(0 until sc.defaultParallelism).foreach { i => 27 | if (math.random > 0.75) { 28 | throw new Exception("Testing exception handling") 29 | } 30 | } 31 | 32 | sc.stop() 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/GroupByTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | import org.apache.spark.SparkContext._ 24 | 25 | /** 26 | * Usage: GroupByTest [numMappers] [numKVPairs] [KeySize] [numReducers] 27 | */ 28 | object GroupByTest { 29 | def main(args: Array[String]) { 30 | val sparkConf = new SparkConf().setAppName("GroupBy Test") 31 | var numMappers = if (args.length > 0) args(0).toInt else 2 32 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000 33 | var valSize = if (args.length > 2) args(2).toInt else 1000 34 | var numReducers = if (args.length > 3) args(3).toInt else numMappers 35 | 36 | val sc = new SparkContext(sparkConf) 37 | 38 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => 39 | val ranGen = new Random 40 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs) 41 | for (i <- 0 until numKVPairs) { 42 | val byteArr = new Array[Byte](valSize) 43 | ranGen.nextBytes(byteArr) 44 | arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) 45 | } 46 | arr1 47 | }.cache 48 | // Enforce that everything has been calculated and in cache 49 | pairs1.count 50 | 51 | println(pairs1.groupByKey(numReducers).count) 52 | 53 | sc.stop() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/HBaseTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
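GroupByTest above measures groupByKey, which ships every value for a key across the shuffle. When only an aggregate per key is needed, reduceByKey combines values map-side before shuffling. A minimal sketch for comparison follows; the object name, sample pairs, and local[2] master are illustrative.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

object ReduceByKeySketch {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("ReduceByKeySketch").setMaster("local[2]"))
    val pairs = sc.parallelize(Seq(("a", 1), ("b", 1), ("a", 1)), 2)
    // reduceByKey combines values map-side before the shuffle, unlike groupByKey,
    // which ships every value to the reducer that owns its key.
    val counts = pairs.reduceByKey(_ + _)
    counts.collect().foreach(println)  // (a,2) and (b,1), in either order
    sc.stop()
  }
}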
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.hadoop.hbase.client.HBaseAdmin 21 | import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor} 22 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 23 | 24 | import org.apache.spark._ 25 | import org.apache.spark.rdd.NewHadoopRDD 26 | 27 | object HBaseTest { 28 | def main(args: Array[String]) { 29 | val sparkConf = new SparkConf().setAppName("HBaseTest") 30 | val sc = new SparkContext(sparkConf) 31 | val conf = HBaseConfiguration.create() 32 | // Other options for configuring scan behavior are available. More information available at 33 | // http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html 34 | conf.set(TableInputFormat.INPUT_TABLE, args(1)) 35 | 36 | // Initialize hBase table if necessary 37 | val admin = new HBaseAdmin(conf) 38 | if(!admin.isTableAvailable(args(1))) { 39 | val tableDesc = new HTableDescriptor(args(1)) 40 | admin.createTable(tableDesc) 41 | } 42 | 43 | val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], 44 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], 45 | classOf[org.apache.hadoop.hbase.client.Result]) 46 | 47 | hBaseRDD.count() 48 | 49 | sc.stop() 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/HdfsTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
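HBaseTest above wires an InputFormat class plus its key and value classes into newAPIHadoopRDD. The same call shape works for any new-API Hadoop InputFormat; the sketch below uses TextInputFormat with a hypothetical HDFS path, and the object name and local[2] master are likewise illustrative.

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.{SparkConf, SparkContext}

object NewApiHadoopSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("NewApiHadoopSketch").setMaster("local[2]"))
    // Same shape as HBaseTest's call: an InputFormat class plus its key and value classes.
    val lines = sc.newAPIHadoopFile(
      "hdfs:///tmp/input.txt",  // hypothetical path
      classOf[TextInputFormat],
      classOf[LongWritable],
      classOf[Text])
    println("lines: " + lines.map(_._2.toString).count())
    sc.stop()
  }
}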
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark._ 21 | 22 | object HdfsTest { 23 | def main(args: Array[String]) { 24 | val sparkConf = new SparkConf().setAppName("HdfsTest") 25 | val sc = new SparkContext(sparkConf) 26 | val file = sc.textFile(args(1)) 27 | val mapped = file.map(s => s.length).cache() 28 | for (iter <- 1 to 10) { 29 | val start = System.currentTimeMillis() 30 | for (x <- mapped) { x + 2 } 31 | // println("Processing: " + x) 32 | val end = System.currentTimeMillis() 33 | println("Iteration " + iter + " took " + (end-start) + " ms") 34 | } 35 | sc.stop() 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/LocalFileLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import breeze.linalg.{Vector, DenseVector} 23 | 24 | object LocalFileLR { 25 | val D = 10 // Numer of dimensions 26 | val rand = new Random(42) 27 | 28 | case class DataPoint(x: Vector[Double], y: Double) 29 | 30 | def parsePoint(line: String): DataPoint = { 31 | val nums = line.split(' ').map(_.toDouble) 32 | DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) 33 | } 34 | 35 | def main(args: Array[String]) { 36 | val lines = scala.io.Source.fromFile(args(0)).getLines().toArray 37 | val points = lines.map(parsePoint _) 38 | val ITERATIONS = args(1).toInt 39 | 40 | // Initialize w to a random value 41 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 42 | println("Initial w: " + w) 43 | 44 | for (i <- 1 to ITERATIONS) { 45 | println("On iteration " + i) 46 | var gradient = DenseVector.zeros[Double](D) 47 | for (p <- points) { 48 | val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y 49 | gradient += p.x * scale 50 | } 51 | w -= gradient 52 | } 53 | 54 | println("Final w: " + w) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/LocalLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
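The inner loop of LocalFileLR above is plain batch gradient descent for logistic regression: each point contributes x * ((1 / (1 + exp(-y * w.dot(x)))) - 1) * y to the gradient. The sketch below works one step on a single point using the same breeze types the example already depends on; the weights, point, and object name are illustrative values, not data from the examples.

import breeze.linalg.DenseVector

object GradientStepSketch {
  def main(args: Array[String]) {
    // One gradient contribution of the logistic-regression update used in LocalFileLR,
    // computed on a single data point so the arithmetic is easy to follow.
    val w = DenseVector(0.5, -0.25)  // current weights
    val x = DenseVector(1.0, 2.0)    // features of one point
    val y = 1.0                      // label in {-1, +1}
    val scale = (1.0 / (1.0 + math.exp(-y * (w.dot(x)))) - 1.0) * y
    val gradient = x * scale
    println("gradient = " + gradient)
    println("updated w = " + (w - gradient))
  }
}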
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import breeze.linalg.{Vector, DenseVector} 23 | 24 | /** 25 | * Logistic regression based classification. 26 | */ 27 | object LocalLR { 28 | val N = 10000 // Number of data points 29 | val D = 10 // Number of dimensions 30 | val R = 0.7 // Scaling factor 31 | val ITERATIONS = 5 32 | val rand = new Random(42) 33 | 34 | case class DataPoint(x: Vector[Double], y: Double) 35 | 36 | def generateData = { 37 | def generatePoint(i: Int) = { 38 | val y = if(i % 2 == 0) -1 else 1 39 | val x = DenseVector.fill(D){rand.nextGaussian + y * R} 40 | DataPoint(x, y) 41 | } 42 | Array.tabulate(N)(generatePoint) 43 | } 44 | 45 | def main(args: Array[String]) { 46 | val data = generateData 47 | 48 | // Initialize w to a random value 49 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 50 | println("Initial w: " + w) 51 | 52 | for (i <- 1 to ITERATIONS) { 53 | println("On iteration " + i) 54 | var gradient = DenseVector.zeros[Double](D) 55 | for (p <- data) { 56 | val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y 57 | gradient += p.x * scale 58 | } 59 | w -= gradient 60 | } 61 | 62 | println("Final w: " + w) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/LocalPi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.random 21 | 22 | import org.apache.spark._ 23 | import org.apache.spark.SparkContext._ 24 | 25 | object LocalPi { 26 | def main(args: Array[String]) { 27 | var count = 0 28 | for (i <- 1 to 100000) { 29 | val x = random * 2 - 1 30 | val y = random * 2 - 1 31 | if (x*x + y*y < 1) count += 1 32 | } 33 | println("Pi is roughly " + 4 * count / 100000.0) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/MultiBroadcastTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.rdd.RDD 21 | import org.apache.spark.{SparkConf, SparkContext} 22 | 23 | /** 24 | * Usage: MultiBroadcastTest [slices] [numElem] 25 | */ 26 | object MultiBroadcastTest { 27 | def main(args: Array[String]) { 28 | 29 | val sparkConf = new SparkConf().setAppName("Multi-Broadcast Test") 30 | val sc = new SparkContext(sparkConf) 31 | 32 | val slices = if (args.length > 0) args(0).toInt else 2 33 | val num = if (args.length > 1) args(1).toInt else 1000000 34 | 35 | val arr1 = new Array[Int](num) 36 | for (i <- 0 until arr1.length) { 37 | arr1(i) = i 38 | } 39 | 40 | val arr2 = new Array[Int](num) 41 | for (i <- 0 until arr2.length) { 42 | arr2(i) = i 43 | } 44 | 45 | val barr1 = sc.broadcast(arr1) 46 | val barr2 = sc.broadcast(arr2) 47 | val observedSizes: RDD[(Int, Int)] = sc.parallelize(1 to 10, slices).map { _ => 48 | (barr1.value.size, barr2.value.size) 49 | } 50 | // Collect the small RDD so we can print the observed sizes locally. 51 | observedSizes.collect().foreach(i => println(i)) 52 | 53 | sc.stop() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/SkewedGroupByTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | import org.apache.spark.SparkContext._ 24 | 25 | /** 26 | * Usage: GroupByTest [numMappers] [numKVPairs] [KeySize] [numReducers] 27 | */ 28 | object SkewedGroupByTest { 29 | def main(args: Array[String]) { 30 | val sparkConf = new SparkConf().setAppName("GroupBy Test") 31 | var numMappers = if (args.length > 0) args(0).toInt else 2 32 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000 33 | var valSize = if (args.length > 2) args(2).toInt else 1000 34 | var numReducers = if (args.length > 3) args(3).toInt else numMappers 35 | 36 | val sc = new SparkContext(sparkConf) 37 | 38 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => 39 | val ranGen = new Random 40 | 41 | // map output sizes lineraly increase from the 1st to the last 42 | numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt 43 | 44 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs) 45 | for (i <- 0 until numKVPairs) { 46 | val byteArr = new Array[Byte](valSize) 47 | ranGen.nextBytes(byteArr) 48 | arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) 49 | } 50 | arr1 51 | }.cache() 52 | // Enforce that everything has been calculated and in cache 53 | pairs1.count() 54 | 55 | println(pairs1.groupByKey(numReducers).count()) 56 | 57 | sc.stop() 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import scala.math.exp 23 | 24 | import breeze.linalg.{Vector, DenseVector} 25 | 26 | import org.apache.spark._ 27 | 28 | /** 29 | * Logistic regression based classification. 
30 | * Usage: SparkLR [slices] 31 | */ 32 | object SparkLR { 33 | val N = 10000 // Number of data points 34 | val D = 10 // Numer of dimensions 35 | val R = 0.7 // Scaling factor 36 | val ITERATIONS = 5 37 | val rand = new Random(42) 38 | 39 | case class DataPoint(x: Vector[Double], y: Double) 40 | 41 | def generateData = { 42 | def generatePoint(i: Int) = { 43 | val y = if(i % 2 == 0) -1 else 1 44 | val x = DenseVector.fill(D){rand.nextGaussian + y * R} 45 | DataPoint(x, y) 46 | } 47 | Array.tabulate(N)(generatePoint) 48 | } 49 | 50 | def main(args: Array[String]) { 51 | val sparkConf = new SparkConf().setAppName("SparkLR") 52 | val sc = new SparkContext(sparkConf) 53 | val numSlices = if (args.length > 0) args(0).toInt else 2 54 | val points = sc.parallelize(generateData, numSlices).cache() 55 | 56 | // Initialize w to a random value 57 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 58 | println("Initial w: " + w) 59 | 60 | for (i <- 1 to ITERATIONS) { 61 | println("On iteration " + i) 62 | val gradient = points.map { p => 63 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y 64 | }.reduce(_ + _) 65 | w -= gradient 66 | } 67 | 68 | println("Final w: " + w) 69 | sc.stop() 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.SparkContext._ 21 | import org.apache.spark.{SparkConf, SparkContext} 22 | 23 | /** 24 | * Computes the PageRank of URLs from an input file. Input file should 25 | * be in format of: 26 | * URL neighbor URL 27 | * URL neighbor URL 28 | * URL neighbor URL 29 | * ... 30 | * where URL and their neighbors are separated by space(s). 
31 | */ 32 | object SparkPageRank { 33 | def main(args: Array[String]) { 34 | val sparkConf = new SparkConf().setAppName("PageRank") 35 | var iters = args(1).toInt 36 | val ctx = new SparkContext(sparkConf) 37 | val lines = ctx.textFile(args(0), 1) 38 | val links = lines.map{ s => 39 | val parts = s.split("\\s+") 40 | (parts(0), parts(1)) 41 | }.distinct().groupByKey().cache() 42 | var ranks = links.mapValues(v => 1.0) 43 | 44 | for (i <- 1 to iters) { 45 | val contribs = links.join(ranks).values.flatMap{ case (urls, rank) => 46 | val size = urls.size 47 | urls.map(url => (url, rank / size)) 48 | } 49 | ranks = contribs.reduceByKey(_ + _).mapValues(0.15 + 0.85 * _) 50 | } 51 | 52 | val output = ranks.collect() 53 | output.foreach(tup => println(tup._1 + " has rank: " + tup._2 + ".")) 54 | 55 | ctx.stop() 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.random 21 | 22 | import org.apache.spark._ 23 | 24 | /** Computes an approximation to pi */ 25 | object SparkPi { 26 | def main(args: Array[String]) { 27 | val conf = new SparkConf().setAppName("Spark Pi") 28 | val spark = new SparkContext(conf) 29 | val slices = if (args.length > 0) args(0).toInt else 2 30 | val n = 100000 * slices 31 | val count = spark.parallelize(1 to n, slices).map { i => 32 | val x = random * 2 - 1 33 | val y = random * 2 - 1 34 | if (x*x + y*y < 1) 1 else 0 35 | }.reduce(_ + _) 36 | println("Pi is roughly " + 4.0 * count / n) 37 | spark.stop() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/SparkTC.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.util.Random 21 | import scala.collection.mutable 22 | 23 | import org.apache.spark.{SparkConf, SparkContext} 24 | import org.apache.spark.SparkContext._ 25 | 26 | /** 27 | * Transitive closure on a graph. 28 | */ 29 | object SparkTC { 30 | val numEdges = 200 31 | val numVertices = 100 32 | val rand = new Random(42) 33 | 34 | def generateGraph = { 35 | val edges: mutable.Set[(Int, Int)] = mutable.Set.empty 36 | while (edges.size < numEdges) { 37 | val from = rand.nextInt(numVertices) 38 | val to = rand.nextInt(numVertices) 39 | if (from != to) edges.+=((from, to)) 40 | } 41 | edges.toSeq 42 | } 43 | 44 | def main(args: Array[String]) { 45 | val sparkConf = new SparkConf().setAppName("SparkTC") 46 | val spark = new SparkContext(sparkConf) 47 | val slices = if (args.length > 0) args(0).toInt else 2 48 | var tc = spark.parallelize(generateGraph, slices).cache() 49 | 50 | // Linear transitive closure: each round grows paths by one edge, 51 | // by joining the graph's edges with the already-discovered paths. 52 | // e.g. join the path (y, z) from the TC with the edge (x, y) from 53 | // the graph to obtain the path (x, z). 54 | 55 | // Because join() joins on keys, the edges are stored in reversed order. 56 | val edges = tc.map(x => (x._2, x._1)) 57 | 58 | // This join is iterated until a fixed point is reached. 59 | var oldCount = 0L 60 | var nextCount = tc.count() 61 | do { 62 | oldCount = nextCount 63 | // Perform the join, obtaining an RDD of (y, (z, x)) pairs, 64 | // then project the result to obtain the new (x, z) paths. 65 | tc = tc.union(tc.join(edges).map(x => (x._2._2, x._2._1))).distinct().cache() 66 | nextCount = tc.count() 67 | } while (nextCount != oldCount) 68 | 69 | println("TC has " + tc.count() + " edges.") 70 | spark.stop() 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/SparkTachyonPi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
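SparkTC above reassigns tc to union(...).distinct() on every round, so the RDD lineage grows with the number of iterations. Periodic checkpointing is one way to keep that in check; the following is a sketch under that assumption, with a hypothetical checkpoint directory, an illustrative fixed number of rounds in place of the example's fixed-point test, and an illustrative object name.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

object LineageCheckpointSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("LineageCheckpointSketch").setMaster("local[2]"))
    sc.setCheckpointDir("/tmp/spark-checkpoints")  // hypothetical directory
    var tc = sc.parallelize(Seq((1, 2), (2, 3), (3, 4))).cache()
    val edges = tc.map { case (x, y) => (y, x) }
    for (round <- 1 to 5) {
      tc = tc.union(tc.join(edges).map { case (_, (z, x)) => (x, z) }).distinct().cache()
      if (round % 2 == 0) {
        // Truncate the lineage every couple of rounds so the plan does not keep growing.
        tc.checkpoint()
      }
      tc.count()  // materialize this round (and write the checkpoint, if one was requested)
    }
    println("paths: " + tc.count())
    sc.stop()
  }
}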
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.random 21 | 22 | import org.apache.spark._ 23 | import org.apache.spark.storage.StorageLevel 24 | 25 | /** 26 | * Computes an approximation to pi 27 | * This example uses Tachyon to persist rdds during computation. 28 | */ 29 | object SparkTachyonPi { 30 | def main(args: Array[String]) { 31 | val sparkConf = new SparkConf().setAppName("SparkTachyonPi") 32 | val spark = new SparkContext(sparkConf) 33 | 34 | val slices = if (args.length > 0) args(0).toInt else 2 35 | val n = 100000 * slices 36 | 37 | val rdd = spark.parallelize(1 to n, slices) 38 | rdd.persist(StorageLevel.OFF_HEAP) 39 | val count = rdd.map { i => 40 | val x = random * 2 - 1 41 | val y = random * 2 - 1 42 | if (x * x + y * y < 1) 1 else 0 43 | }.reduce(_ + _) 44 | println("Pi is roughly " + 4.0 * count / n) 45 | 46 | spark.stop() 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/graphx/LiveJournalPageRank.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.graphx 19 | 20 | import org.apache.spark.SparkContext._ 21 | import org.apache.spark._ 22 | import org.apache.spark.graphx._ 23 | import org.apache.spark.graphx.lib.Analytics 24 | 25 | /** 26 | * Uses GraphX to run PageRank on a LiveJournal social network graph. Download the dataset from 27 | * http://snap.stanford.edu/data/soc-LiveJournal1.html. 28 | */ 29 | object LiveJournalPageRank { 30 | def main(args: Array[String]) { 31 | if (args.length < 1) { 32 | System.err.println( 33 | "Usage: LiveJournalPageRank \n" + 34 | " [--tol=]\n" + 35 | " The tolerance allowed at convergence (smaller => more accurate). Default is " + 36 | "0.001.\n" + 37 | " [--output=]\n" + 38 | " If specified, the file to write the ranks to.\n" + 39 | " [--numEPart=]\n" + 40 | " The number of partitions for the graph's edge RDD. Default is 4.\n" + 41 | " [--partStrategy=RandomVertexCut | EdgePartition1D | EdgePartition2D | " + 42 | "CanonicalRandomVertexCut]\n" + 43 | " The way edges are assigned to edge partitions. 
Default is RandomVertexCut.") 44 | System.exit(-1) 45 | } 46 | 47 | Analytics.main(args.patch(0, List("pagerank"), 0)) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.mllib 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 22 | import org.apache.spark.mllib.linalg.Vectors 23 | 24 | /** 25 | * Compute the principal components of a tall-and-skinny matrix, whose rows are observations. 26 | * 27 | * The input matrix must be stored in row-oriented dense format, one line per row with its entries 28 | * separated by space. For example, 29 | * {{{ 30 | * 0.5 1.0 31 | * 2.0 3.0 32 | * 4.0 5.0 33 | * }}} 34 | * represents a 3-by-2 matrix, whose first row is (0.5, 1.0). 35 | */ 36 | object TallSkinnyPCA { 37 | def main(args: Array[String]) { 38 | if (args.length != 1) { 39 | System.err.println("Usage: TallSkinnyPCA ") 40 | System.exit(1) 41 | } 42 | 43 | val conf = new SparkConf().setAppName("TallSkinnyPCA") 44 | val sc = new SparkContext(conf) 45 | 46 | // Load and parse the data file. 47 | val rows = sc.textFile(args(0)).map { line => 48 | val values = line.split(' ').map(_.toDouble) 49 | Vectors.dense(values) 50 | } 51 | val mat = new RowMatrix(rows) 52 | 53 | // Compute principal components. 54 | val pc = mat.computePrincipalComponents(mat.numCols().toInt) 55 | 56 | println("Principal components are:\n" + pc) 57 | 58 | sc.stop() 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.mllib 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 22 | import org.apache.spark.mllib.linalg.Vectors 23 | 24 | /** 25 | * Compute the singular value decomposition (SVD) of a tall-and-skinny matrix. 26 | * 27 | * The input matrix must be stored in row-oriented dense format, one line per row with its entries 28 | * separated by space. For example, 29 | * {{{ 30 | * 0.5 1.0 31 | * 2.0 3.0 32 | * 4.0 5.0 33 | * }}} 34 | * represents a 3-by-2 matrix, whose first row is (0.5, 1.0). 35 | */ 36 | object TallSkinnySVD { 37 | def main(args: Array[String]) { 38 | if (args.length != 1) { 39 | System.err.println("Usage: TallSkinnySVD ") 40 | System.exit(1) 41 | } 42 | 43 | val conf = new SparkConf().setAppName("TallSkinnySVD") 44 | val sc = new SparkContext(conf) 45 | 46 | // Load and parse the data file. 47 | val rows = sc.textFile(args(0)).map { line => 48 | val values = line.split(' ').map(_.toDouble) 49 | Vectors.dense(values) 50 | } 51 | val mat = new RowMatrix(rows) 52 | 53 | // Compute SVD. 54 | val svd = mat.computeSVD(mat.numCols().toInt) 55 | 56 | println("Singular values are " + svd.s) 57 | 58 | sc.stop() 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/streaming/FlumeEventCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.streaming 19 | 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.storage.StorageLevel 22 | import org.apache.spark.streaming._ 23 | import org.apache.spark.streaming.flume._ 24 | import org.apache.spark.util.IntParam 25 | 26 | /** 27 | * Produces a count of events received from Flume. 28 | * 29 | * This should be used in conjunction with an AvroSink in Flume. It will start 30 | * an Avro server on at the request host:port address and listen for requests. 31 | * Your Flume AvroSink should be pointed to this address. 
32 | * 33 | * Usage: FlumeEventCount 34 | * is the host the Flume receiver will be started on - a receiver 35 | * creates a server and listens for flume events. 36 | * is the port the Flume receiver will listen on. 37 | * 38 | * To run this example: 39 | * `$ bin/run-example org.apache.spark.examples.streaming.FlumeEventCount ` 40 | */ 41 | object FlumeEventCount { 42 | def main(args: Array[String]) { 43 | if (args.length < 2) { 44 | System.err.println( 45 | "Usage: FlumeEventCount ") 46 | System.exit(1) 47 | } 48 | 49 | StreamingExamples.setStreamingLogLevels() 50 | 51 | val Array(host, IntParam(port)) = args 52 | 53 | val batchInterval = Milliseconds(2000) 54 | 55 | // Create the context and set the batch size 56 | val sparkConf = new SparkConf().setAppName("FlumeEventCount") 57 | val ssc = new StreamingContext(sparkConf, batchInterval) 58 | 59 | // Create a flume stream 60 | val stream = FlumeUtils.createStream(ssc, host, port, StorageLevel.MEMORY_ONLY_SER_2) 61 | 62 | // Print out the count of events received from this server in each batch 63 | stream.count().map(cnt => "Received " + cnt + " flume events." ).print() 64 | 65 | ssc.start() 66 | ssc.awaitTermination() 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/streaming/HdfsWordCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.streaming 19 | 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.streaming.{Seconds, StreamingContext} 22 | import org.apache.spark.streaming.StreamingContext._ 23 | 24 | /** 25 | * Counts words in new text files created in the given directory 26 | * Usage: HdfsWordCount 27 | * is the directory that Spark Streaming will use to find and read new text files. 28 | * 29 | * To run this on your local machine on directory `localdir`, run this example 30 | * $ bin/run-example \ 31 | * org.apache.spark.examples.streaming.HdfsWordCount localdir 32 | * 33 | * Then create a text file in `localdir` and the words in the file will get counted. 
34 | */ 35 | object HdfsWordCount { 36 | def main(args: Array[String]) { 37 | if (args.length < 1) { 38 | System.err.println("Usage: HdfsWordCount ") 39 | System.exit(1) 40 | } 41 | 42 | StreamingExamples.setStreamingLogLevels() 43 | val sparkConf = new SparkConf().setAppName("HdfsWordCount") 44 | // Create the context 45 | val ssc = new StreamingContext(sparkConf, Seconds(2)) 46 | 47 | // Create the FileInputDStream on the directory and use the 48 | // stream to count words in new files created 49 | val lines = ssc.textFileStream(args(0)) 50 | val words = lines.flatMap(_.split(" ")) 51 | val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) 52 | wordCounts.print() 53 | ssc.start() 54 | ssc.awaitTermination() 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/streaming/NetworkWordCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.streaming 19 | 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.streaming.{Seconds, StreamingContext} 22 | import org.apache.spark.streaming.StreamingContext._ 23 | import org.apache.spark.storage.StorageLevel 24 | 25 | /** 26 | * Counts words in UTF8 encoded, '\n' delimited text received from the network every second. 27 | * 28 | * Usage: NetworkWordCount 29 | * and describe the TCP server that Spark Streaming would connect to receive data. 30 | * 31 | * To run this on your local machine, you need to first run a Netcat server 32 | * `$ nc -lk 9999` 33 | * and then run the example 34 | * `$ bin/run-example org.apache.spark.examples.streaming.NetworkWordCount localhost 9999` 35 | */ 36 | object NetworkWordCount { 37 | def main(args: Array[String]) { 38 | if (args.length < 2) { 39 | System.err.println("Usage: NetworkWordCount ") 40 | System.exit(1) 41 | } 42 | 43 | StreamingExamples.setStreamingLogLevels() 44 | 45 | // Create the context with a 1 second batch size 46 | val sparkConf = new SparkConf().setAppName("NetworkWordCount") 47 | val ssc = new StreamingContext(sparkConf, Seconds(1)) 48 | 49 | // Create a socket stream on target ip:port and count the 50 | // words in input stream of \n delimited text (eg. generated by 'nc') 51 | // Note that no duplication in storage level only for running locally. 52 | // Replication necessary in distributed scenario for fault tolerance. 
53 | val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER) 54 | val words = lines.flatMap(_.split(" ")) 55 | val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) 56 | wordCounts.print() 57 | ssc.start() 58 | ssc.awaitTermination() 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/streaming/QueueStream.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.streaming 19 | 20 | import scala.collection.mutable.SynchronizedQueue 21 | 22 | import org.apache.spark.SparkConf 23 | import org.apache.spark.rdd.RDD 24 | import org.apache.spark.streaming.{Seconds, StreamingContext} 25 | import org.apache.spark.streaming.StreamingContext._ 26 | 27 | object QueueStream { 28 | 29 | def main(args: Array[String]) { 30 | 31 | StreamingExamples.setStreamingLogLevels() 32 | val sparkConf = new SparkConf().setAppName("QueueStream") 33 | // Create the context 34 | val ssc = new StreamingContext(sparkConf, Seconds(1)) 35 | 36 | // Create the queue through which RDDs can be pushed to 37 | // a QueueInputDStream 38 | val rddQueue = new SynchronizedQueue[RDD[Int]]() 39 | 40 | // Create the QueueInputDStream and use it do some processing 41 | val inputStream = ssc.queueStream(rddQueue) 42 | val mappedStream = inputStream.map(x => (x % 10, 1)) 43 | val reducedStream = mappedStream.reduceByKey(_ + _) 44 | reducedStream.print() 45 | ssc.start() 46 | 47 | // Create and push some RDDs into 48 | for (i <- 1 to 30) { 49 | rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10) 50 | Thread.sleep(1000) 51 | } 52 | ssc.stop() 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/examples/src/main/scala/org/apache/spark/examples/streaming/StreamingExamples.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
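NetworkWordCount and HdfsWordCount above both report counts for each batch in isolation. To keep a running total across batches, Spark Streaming's updateStateByKey can replace the per-batch reduceByKey; a sketch under that assumption follows, with a hypothetical checkpoint directory, a hypothetical localhost:9999 source as in NetworkWordCount, and an illustrative object name.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._

object StatefulWordCountSketch {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("StatefulWordCountSketch").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(1))
    ssc.checkpoint("/tmp/streaming-checkpoints")  // hypothetical directory; required by updateStateByKey

    val lines = ssc.socketTextStream("localhost", 9999)  // hypothetical host and port
    val pairs = lines.flatMap(_.split(" ")).map(word => (word, 1))

    // Keep a running total per word across batches instead of a per-batch count.
    val updateTotals = (newCounts: Seq[Int], total: Option[Int]) =>
      Some(newCounts.sum + total.getOrElse(0))
    val runningCounts = pairs.updateStateByKey[Int](updateTotals)

    runningCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}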
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.streaming 19 | 20 | import org.apache.spark.Logging 21 | 22 | import org.apache.log4j.{Level, Logger} 23 | 24 | /** Utility functions for Spark Streaming examples. */ 25 | object StreamingExamples extends Logging { 26 | 27 | /** Set reasonable logging levels for streaming if the user has not configured log4j. */ 28 | def setStreamingLogLevels() { 29 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements 30 | if (!log4jInitialized) { 31 | // We first log something to initialize Spark's default logging, then we override the 32 | // logging level. 33 | logInfo("Setting log level to [WARN] for streaming example." + 34 | " To override add a custom log4j.properties to the classpath.") 35 | Logger.getRootLogger.setLevel(Level.WARN) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/lib/spark-assembly.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/initial-deployment-puppet/modules/spark/files/spark/lib/spark-assembly.1 -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/lib/spark-assembly.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/initial-deployment-puppet/modules/spark/files/spark/lib/spark-assembly.2 -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | docs/ 3 | pyspark.egg-info 4 | build/ 5 | dist/ 6 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/epydoc.conf: -------------------------------------------------------------------------------- 1 | [epydoc] # Epydoc section marker (required by ConfigParser) 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Information about the project. 21 | name: Spark 1.0.0 Python API Docs 22 | url: http://spark.apache.org 23 | 24 | # The list of modules to document. Modules can be named using 25 | # dotted names, module filenames, or package directory names. 26 | # This option may be repeated. 27 | modules: pyspark 28 | 29 | # Write html output to the directory "apidocs" 30 | output: html 31 | target: docs/ 32 | 33 | private: no 34 | 35 | exclude: pyspark.cloudpickle pyspark.worker pyspark.join 36 | pyspark.java_gateway pyspark.examples pyspark.shell pyspark.tests 37 | pyspark.rddsampler pyspark.daemon pyspark.mllib._common 38 | pyspark.mllib.tests 39 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/lib/PY4J_LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | - Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | - Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | - The name of the author may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/lib/py4j-0.8.1-src.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/initial-deployment-puppet/modules/spark/files/spark/python/lib/py4j-0.8.1-src.zip -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/pyspark/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 
5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | PySpark is the Python API for Spark. 20 | 21 | Public classes: 22 | 23 | - L{SparkContext} 24 | Main entry point for Spark functionality. 25 | - L{RDD} 26 | A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. 27 | - L{Broadcast} 28 | A broadcast variable that gets reused across tasks. 29 | - L{Accumulator} 30 | An "add-only" shared variable that tasks can only add values to. 31 | - L{SparkConf} 32 | For configuring Spark. 33 | - L{SparkFiles} 34 | Access files shipped with jobs. 35 | - L{StorageLevel} 36 | Finer-grained cache persistence levels. 37 | 38 | Spark SQL: 39 | - L{SQLContext} 40 | Main entry point for SQL functionality. 41 | - L{SchemaRDD} 42 | A Resilient Distributed Dataset (RDD) with Schema information for the data contained. In 43 | addition to normal RDD operations, SchemaRDDs also support SQL. 44 | - L{Row} 45 | A Row of data returned by a Spark SQL query. 46 | 47 | Hive: 48 | - L{HiveContext} 49 | Main entry point for accessing data stored in Apache Hive.. 50 | """ 51 | 52 | from pyspark.conf import SparkConf 53 | from pyspark.context import SparkContext 54 | from pyspark.sql import SQLContext 55 | from pyspark.rdd import RDD 56 | from pyspark.sql import SchemaRDD 57 | from pyspark.sql import Row 58 | from pyspark.files import SparkFiles 59 | from pyspark.storagelevel import StorageLevel 60 | 61 | 62 | __all__ = ["SparkConf", "SparkContext", "SQLContext", "RDD", "SchemaRDD", "SparkFiles", "StorageLevel", "Row"] 63 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/pyspark/broadcast.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | """ 19 | >>> from pyspark.context import SparkContext 20 | >>> sc = SparkContext('local', 'test') 21 | >>> b = sc.broadcast([1, 2, 3, 4, 5]) 22 | >>> b.value 23 | [1, 2, 3, 4, 5] 24 | 25 | >>> from pyspark.broadcast import _broadcastRegistry 26 | >>> _broadcastRegistry[b.bid] = b 27 | >>> from cPickle import dumps, loads 28 | >>> loads(dumps(b)).value 29 | [1, 2, 3, 4, 5] 30 | 31 | >>> sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect() 32 | [1, 2, 3, 4, 5, 1, 2, 3, 4, 5] 33 | 34 | >>> large_broadcast = sc.broadcast(list(range(10000))) 35 | """ 36 | # Holds broadcasted data received from Java, keyed by its id. 37 | _broadcastRegistry = {} 38 | 39 | 40 | def _from_id(bid): 41 | from pyspark.broadcast import _broadcastRegistry 42 | if bid not in _broadcastRegistry: 43 | raise Exception("Broadcast variable '%s' not loaded!" % bid) 44 | return _broadcastRegistry[bid] 45 | 46 | 47 | class Broadcast(object): 48 | """ 49 | A broadcast variable created with 50 | L{SparkContext.broadcast()}. 51 | Access its value through C{.value}. 52 | """ 53 | 54 | def __init__(self, bid, value, java_broadcast=None, pickle_registry=None): 55 | """ 56 | Should not be called directly by users -- use 57 | L{SparkContext.broadcast()} 58 | instead. 59 | """ 60 | self.value = value 61 | self.bid = bid 62 | self._jbroadcast = java_broadcast 63 | self._pickle_registry = pickle_registry 64 | 65 | def __reduce__(self): 66 | self._pickle_registry.add(self) 67 | return (_from_id, (self.bid, )) 68 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/pyspark/files.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import os 19 | 20 | 21 | class SparkFiles(object): 22 | """ 23 | Resolves paths to files added through 24 | L{SparkContext.addFile()}. 25 | 26 | SparkFiles contains only classmethods; users should not create SparkFiles 27 | instances. 28 | """ 29 | 30 | _root_directory = None 31 | _is_running_on_worker = False 32 | _sc = None 33 | 34 | def __init__(self): 35 | raise NotImplementedError("Do not construct SparkFiles objects") 36 | 37 | @classmethod 38 | def get(cls, filename): 39 | """ 40 | Get the absolute path of a file added through C{SparkContext.addFile()}. 41 | """ 42 | path = os.path.join(SparkFiles.getRootDirectory(), filename) 43 | return os.path.abspath(path) 44 | 45 | @classmethod 46 | def getRootDirectory(cls): 47 | """ 48 | Get the root directory that contains files added through 49 | C{SparkContext.addFile()}. 
50 | """ 51 | if cls._is_running_on_worker: 52 | return cls._root_directory 53 | else: 54 | # This will have to change if we support multiple SparkContexts: 55 | return cls._sc._jvm.org.apache.spark.SparkFiles.getRootDirectory() 56 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/pyspark/mllib/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | Python bindings for MLlib. 20 | """ 21 | 22 | # MLlib currently needs and NumPy 1.4+, so complain if lower 23 | 24 | import numpy 25 | if numpy.version.version < '1.4': 26 | raise Exception("MLlib requires NumPy 1.4+") 27 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/pyspark/resultiterable.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | __all__ = ["ResultIterable"] 19 | 20 | import collections 21 | 22 | class ResultIterable(collections.Iterable): 23 | """ 24 | A special result iterable. This is used because the standard iterator can not be pickled 25 | """ 26 | def __init__(self, data): 27 | self.data = data 28 | self.index = 0 29 | self.maxindex = len(data) 30 | def __iter__(self): 31 | return iter(self.data) 32 | def __len__(self): 33 | return len(self.data) 34 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/pyspark/shell.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. 
See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | An interactive shell. 20 | 21 | This file is designed to be launched as a PYTHONSTARTUP script. 22 | """ 23 | 24 | import sys 25 | if sys.version_info[0] != 2: 26 | print("Error: Default Python used is Python%s" % sys.version_info.major) 27 | print("\tSet env variable PYSPARK_PYTHON to Python2 binary and re-run it.") 28 | sys.exit(1) 29 | 30 | 31 | import os 32 | import platform 33 | import pyspark 34 | from pyspark.context import SparkContext 35 | from pyspark.storagelevel import StorageLevel 36 | 37 | # this is the equivalent of ADD_JARS 38 | add_files = os.environ.get("ADD_FILES").split(',') if os.environ.get("ADD_FILES") != None else None 39 | 40 | if os.environ.get("SPARK_EXECUTOR_URI"): 41 | SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"]) 42 | 43 | sc = SparkContext(appName="PySparkShell", pyFiles=add_files) 44 | 45 | print("""Welcome to 46 | ____ __ 47 | / __/__ ___ _____/ /__ 48 | _\ \/ _ \/ _ `/ __/ '_/ 49 | /__ / .__/\_,_/_/ /_/\_\ version 1.0.0 50 | /_/ 51 | """) 52 | print("Using Python version %s (%s, %s)" % ( 53 | platform.python_version(), 54 | platform.python_build()[0], 55 | platform.python_build()[1])) 56 | print("SparkContext available as sc.") 57 | 58 | if add_files != None: 59 | print("Adding files: [%s]" % ", ".join(add_files)) 60 | 61 | # The ./bin/pyspark script stores the old PYTHONSTARTUP value in OLD_PYTHONSTARTUP, 62 | # which allows us to execute the user's PYTHONSTARTUP file: 63 | _pythonstartup = os.environ.get('OLD_PYTHONSTARTUP') 64 | if _pythonstartup and os.path.isfile(_pythonstartup): 65 | execfile(_pythonstartup) 66 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/pyspark/storagelevel.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | __all__ = ["StorageLevel"] 19 | 20 | class StorageLevel: 21 | """ 22 | Flags for controlling the storage of an RDD. Each StorageLevel records whether to use memory, 23 | whether to drop the RDD to disk if it falls out of memory, whether to keep the data in memory 24 | in a serialized format, and whether to replicate the RDD partitions on multiple nodes. 25 | Also contains static constants for some commonly used storage levels, such as MEMORY_ONLY. 26 | """ 27 | 28 | def __init__(self, useDisk, useMemory, useOffHeap, deserialized, replication = 1): 29 | self.useDisk = useDisk 30 | self.useMemory = useMemory 31 | self.useOffHeap = useOffHeap 32 | self.deserialized = deserialized 33 | self.replication = replication 34 | 35 | def __repr__(self): 36 | return "StorageLevel(%s, %s, %s, %s, %s)" % ( 37 | self.useDisk, self.useMemory, self.useOffHeap, self.deserialized, self.replication) 38 | 39 | StorageLevel.DISK_ONLY = StorageLevel(True, False, False, False) 40 | StorageLevel.DISK_ONLY_2 = StorageLevel(True, False, False, False, 2) 41 | StorageLevel.MEMORY_ONLY = StorageLevel(False, True, False, True) 42 | StorageLevel.MEMORY_ONLY_2 = StorageLevel(False, True, False, True, 2) 43 | StorageLevel.MEMORY_ONLY_SER = StorageLevel(False, True, False, False) 44 | StorageLevel.MEMORY_ONLY_SER_2 = StorageLevel(False, True, False, False, 2) 45 | StorageLevel.MEMORY_AND_DISK = StorageLevel(True, True, False, True) 46 | StorageLevel.MEMORY_AND_DISK_2 = StorageLevel(True, True, False, True, 2) 47 | StorageLevel.MEMORY_AND_DISK_SER = StorageLevel(True, True, False, False) 48 | StorageLevel.MEMORY_AND_DISK_SER_2 = StorageLevel(True, True, False, False, 2) 49 | StorageLevel.OFF_HEAP = StorageLevel(False, False, True, False, 1) -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/run-tests: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | 21 | # Figure out where the Spark framework is installed 22 | FWDIR="$(cd `dirname $0`; cd ../; pwd)" 23 | 24 | # CD into the python directory to find things on the right path 25 | cd "$FWDIR/python" 26 | 27 | FAILED=0 28 | 29 | rm -f unit-tests.log 30 | 31 | # Remove the metastore and warehouse directory created by the HiveContext tests in SparkSQL 32 | rm -rf metastore warehouse 33 | 34 | function run_test() { 35 | SPARK_TESTING=0 $FWDIR/bin/pyspark $1 2>&1 | tee -a > unit-tests.log 36 | FAILED=$((PIPESTATUS[0]||$FAILED)) 37 | 38 | # Fail and exit on the first test failure. 
39 | if [[ $FAILED != 0 ]]; then 40 | cat unit-tests.log | grep -v "^[0-9][0-9]*" # filter all lines starting with a number. 41 | echo -en "\033[31m" # Red 42 | echo "Had test failures; see logs." 43 | echo -en "\033[0m" # No color 44 | exit -1 45 | fi 46 | 47 | } 48 | 49 | run_test "pyspark/rdd.py" 50 | run_test "pyspark/context.py" 51 | run_test "pyspark/conf.py" 52 | if [ -n "$_RUN_SQL_TESTS" ]; then 53 | run_test "pyspark/sql.py" 54 | fi 55 | run_test "-m doctest pyspark/broadcast.py" 56 | run_test "-m doctest pyspark/accumulators.py" 57 | run_test "-m doctest pyspark/serializers.py" 58 | run_test "pyspark/tests.py" 59 | run_test "pyspark/mllib/_common.py" 60 | run_test "pyspark/mllib/classification.py" 61 | run_test "pyspark/mllib/clustering.py" 62 | run_test "pyspark/mllib/linalg.py" 63 | run_test "pyspark/mllib/recommendation.py" 64 | run_test "pyspark/mllib/regression.py" 65 | run_test "pyspark/mllib/tests.py" 66 | 67 | if [[ $FAILED == 0 ]]; then 68 | echo -en "\033[32m" # Green 69 | echo "Tests passed." 70 | echo -en "\033[0m" # No color 71 | fi 72 | 73 | # TODO: in the long-run, it would be nice to use a test runner like `nose`. 74 | # The doctest fixtures are the current barrier to doing this. 75 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/test_support/hello.txt: -------------------------------------------------------------------------------- 1 | Hello World! 2 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/test_support/userlib-0.1-py2.7.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe-research/spark-cluster-deployment/3097ae198b2eca6c4afc0d6fb1db1635156f7479/initial-deployment-puppet/modules/spark/files/spark/python/test_support/userlib-0.1-py2.7.egg -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/python/test_support/userlibrary.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | Used to test shipping of code depenencies with SparkContext.addPyFile(). 20 | """ 21 | 22 | class UserClass(object): 23 | def hello(self): 24 | return "Hello World!" 
25 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/slaves.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Run a shell command on all slave hosts. 21 | # 22 | # Environment Variables 23 | # 24 | # SPARK_SLAVES File naming remote hosts. 25 | # Default is ${SPARK_CONF_DIR}/slaves. 26 | # SPARK_CONF_DIR Alternate conf dir. Default is ${SPARK_HOME}/conf. 27 | # SPARK_SLAVE_SLEEP Seconds to sleep between spawning remote commands. 28 | # SPARK_SSH_OPTS Options passed to ssh when running remote commands. 29 | ## 30 | 31 | usage="Usage: slaves.sh [--config ] command..." 32 | 33 | # if no args specified, show usage 34 | if [ $# -le 0 ]; then 35 | echo $usage 36 | exit 1 37 | fi 38 | 39 | sbin=`dirname "$0"` 40 | sbin=`cd "$sbin"; pwd` 41 | 42 | . "$sbin/spark-config.sh" 43 | 44 | # If the slaves file is specified in the command line, 45 | # then it takes precedence over the definition in 46 | # spark-env.sh. Save it here. 47 | HOSTLIST=$SPARK_SLAVES 48 | 49 | # Check if --config is passed as an argument. It is an optional parameter. 50 | # Exit if the argument is not a directory. 51 | if [ "$1" == "--config" ] 52 | then 53 | shift 54 | conf_dir=$1 55 | if [ ! -d "$conf_dir" ] 56 | then 57 | echo "ERROR : $conf_dir is not a directory" 58 | echo $usage 59 | exit 1 60 | else 61 | export SPARK_CONF_DIR=$conf_dir 62 | fi 63 | shift 64 | fi 65 | 66 | . "$SPARK_PREFIX/bin/load-spark-env.sh" 67 | 68 | if [ "$HOSTLIST" = "" ]; then 69 | if [ "$SPARK_SLAVES" = "" ]; then 70 | export HOSTLIST="${SPARK_CONF_DIR}/slaves" 71 | else 72 | export HOSTLIST="${SPARK_SLAVES}" 73 | fi 74 | fi 75 | 76 | # By default disable strict host key checking 77 | if [ "$SPARK_SSH_OPTS" = "" ]; then 78 | SPARK_SSH_OPTS="-o StrictHostKeyChecking=no" 79 | fi 80 | 81 | for slave in `cat "$HOSTLIST"|sed "s/#.*$//;/^$/d"`; do 82 | ssh $SPARK_SSH_OPTS $slave $"${@// /\\ }" \ 83 | 2>&1 | sed "s/^/$slave: /" & 84 | if [ "$SPARK_SLAVE_SLEEP" != "" ]; then 85 | sleep $SPARK_SLAVE_SLEEP 86 | fi 87 | done 88 | 89 | wait 90 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/spark-config.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 
5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # included in all the spark scripts with source command 19 | # should not be executable directly 20 | # also should not be passed any arguments, since we need original $* 21 | 22 | # resolve links - $0 may be a softlink 23 | this="${BASH_SOURCE-$0}" 24 | common_bin=$(cd -P -- "$(dirname -- "$this")" && pwd -P) 25 | script="$(basename -- "$this")" 26 | this="$common_bin/$script" 27 | 28 | # convert relative path to absolute path 29 | config_bin=`dirname "$this"` 30 | script=`basename "$this"` 31 | config_bin=`cd "$config_bin"; pwd` 32 | this="$config_bin/$script" 33 | 34 | export SPARK_PREFIX=`dirname "$this"`/.. 35 | export SPARK_HOME=${SPARK_PREFIX} 36 | export SPARK_CONF_DIR="$SPARK_HOME/conf" 37 | # Add the PySpark classes to the PYTHONPATH: 38 | export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH 39 | export PYTHONPATH=$SPARK_HOME/python/lib/py4j-0.8.1-src.zip:$PYTHONPATH 40 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/spark-daemons.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Run a Spark command on all slave hosts. 21 | 22 | usage="Usage: spark-daemons.sh [--config ] [start|stop] command instance-number args..." 23 | 24 | # if no args specified, show usage 25 | if [ $# -le 1 ]; then 26 | echo $usage 27 | exit 1 28 | fi 29 | 30 | sbin=`dirname "$0"` 31 | sbin=`cd "$sbin"; pwd` 32 | 33 | . "$sbin/spark-config.sh" 34 | 35 | exec "$sbin/slaves.sh" cd "$SPARK_HOME" \; "$sbin/spark-daemon.sh" "$@" 36 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/spark-executor: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. 
See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | FWDIR="$(cd `dirname $0`/..; pwd)" 21 | 22 | export PYTHONPATH=$FWDIR/python:$PYTHONPATH 23 | export PYTHONPATH=$FWDIR/python/lib/py4j-0.8.1-src.zip:$PYTHONPATH 24 | 25 | echo "Running spark-executor with framework dir = $FWDIR" 26 | exec $FWDIR/bin/spark-class org.apache.spark.executor.MesosExecutorBackend 27 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/start-all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Start all spark daemons. 21 | # Starts the master on this node. 22 | # Starts a worker on each node specified in conf/slaves 23 | 24 | sbin=`dirname "$0"` 25 | sbin=`cd "$sbin"; pwd` 26 | 27 | TACHYON_STR="" 28 | 29 | while (( "$#" )); do 30 | case $1 in 31 | --with-tachyon) 32 | TACHYON_STR="--with-tachyon" 33 | ;; 34 | esac 35 | shift 36 | done 37 | 38 | # Load the Spark configuration 39 | . "$sbin/spark-config.sh" 40 | 41 | # Start Master 42 | "$sbin"/start-master.sh $TACHYON_STR 43 | 44 | # Start Workers 45 | "$sbin"/start-slaves.sh $TACHYON_STR 46 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/start-history-server.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Starts the history server on the machine this script is executed on. 21 | # 22 | # Usage: start-history-server.sh [] 23 | # Example: ./start-history-server.sh --dir /tmp/spark-events --port 18080 24 | # 25 | 26 | sbin=`dirname "$0"` 27 | sbin=`cd "$sbin"; pwd` 28 | 29 | if [ $# -lt 1 ]; then 30 | echo "Usage: ./start-history-server.sh " 31 | echo "Example: ./start-history-server.sh /tmp/spark-events" 32 | exit 33 | fi 34 | 35 | LOG_DIR=$1 36 | 37 | "$sbin"/spark-daemon.sh start org.apache.spark.deploy.history.HistoryServer 1 --dir "$LOG_DIR" 38 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/start-master.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Starts the master on the machine this script is executed on. 21 | 22 | sbin=`dirname "$0"` 23 | sbin=`cd "$sbin"; pwd` 24 | 25 | START_TACHYON=false 26 | 27 | while (( "$#" )); do 28 | case $1 in 29 | --with-tachyon) 30 | if [ ! -e "$sbin"/../tachyon/bin/tachyon ]; then 31 | echo "Error: --with-tachyon specified, but tachyon not found." 32 | exit -1 33 | fi 34 | START_TACHYON=true 35 | ;; 36 | esac 37 | shift 38 | done 39 | 40 | . "$sbin/spark-config.sh" 41 | 42 | . 
"$SPARK_PREFIX/bin/load-spark-env.sh" 43 | 44 | if [ "$SPARK_MASTER_PORT" = "" ]; then 45 | SPARK_MASTER_PORT=7077 46 | fi 47 | 48 | if [ "$SPARK_MASTER_IP" = "" ]; then 49 | SPARK_MASTER_IP=`hostname` 50 | fi 51 | 52 | if [ "$SPARK_MASTER_WEBUI_PORT" = "" ]; then 53 | SPARK_MASTER_WEBUI_PORT=8080 54 | fi 55 | 56 | "$sbin"/spark-daemon.sh start org.apache.spark.deploy.master.Master 1 --ip $SPARK_MASTER_IP --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT 57 | 58 | if [ "$START_TACHYON" == "true" ]; then 59 | "$sbin"/../tachyon/bin/tachyon bootstrap-conf $SPARK_MASTER_IP 60 | "$sbin"/../tachyon/bin/tachyon format -s 61 | "$sbin"/../tachyon/bin/tachyon-start.sh master 62 | fi 63 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/start-slave.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Usage: start-slave.sh 21 | # where is like "spark://localhost:7077" 22 | 23 | sbin=`dirname "$0"` 24 | sbin=`cd "$sbin"; pwd` 25 | 26 | "$sbin"/spark-daemon.sh start org.apache.spark.deploy.worker.Worker "$@" 27 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/start-slaves.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | sbin=`dirname "$0"` 21 | sbin=`cd "$sbin"; pwd` 22 | 23 | 24 | START_TACHYON=false 25 | 26 | while (( "$#" )); do 27 | case $1 in 28 | --with-tachyon) 29 | if [ ! -e "$sbin"/../tachyon/bin/tachyon ]; then 30 | echo "Error: --with-tachyon specified, but tachyon not found." 31 | exit -1 32 | fi 33 | START_TACHYON=true 34 | ;; 35 | esac 36 | shift 37 | done 38 | 39 | . 
"$sbin/spark-config.sh" 40 | 41 | . "$SPARK_PREFIX/bin/load-spark-env.sh" 42 | 43 | # Find the port number for the master 44 | if [ "$SPARK_MASTER_PORT" = "" ]; then 45 | SPARK_MASTER_PORT=7077 46 | fi 47 | 48 | if [ "$SPARK_MASTER_IP" = "" ]; then 49 | SPARK_MASTER_IP=`hostname` 50 | fi 51 | 52 | if [ "$START_TACHYON" == "true" ]; then 53 | "$sbin/slaves.sh" cd "$SPARK_HOME" \; "$sbin"/../tachyon/bin/tachyon bootstrap-conf $SPARK_MASTER_IP 54 | 55 | # set -t so we can call sudo 56 | SPARK_SSH_OPTS="-o StrictHostKeyChecking=no -t" "$sbin/slaves.sh" cd "$SPARK_HOME" \; "$sbin/../tachyon/bin/tachyon-start.sh" worker SudoMount \; sleep 1 57 | fi 58 | 59 | # Launch the slaves 60 | if [ "$SPARK_WORKER_INSTANCES" = "" ]; then 61 | exec "$sbin/slaves.sh" cd "$SPARK_HOME" \; "$sbin/start-slave.sh" 1 spark://$SPARK_MASTER_IP:$SPARK_MASTER_PORT 62 | else 63 | if [ "$SPARK_WORKER_WEBUI_PORT" = "" ]; then 64 | SPARK_WORKER_WEBUI_PORT=8081 65 | fi 66 | for ((i=0; i<$SPARK_WORKER_INSTANCES; i++)); do 67 | "$sbin/slaves.sh" cd "$SPARK_HOME" \; "$sbin/start-slave.sh" $(( $i + 1 )) spark://$SPARK_MASTER_IP:$SPARK_MASTER_PORT --webui-port $(( $SPARK_WORKER_WEBUI_PORT + $i )) 68 | done 69 | fi 70 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/stop-all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Start all spark daemons. 21 | # Run this on the master nde 22 | 23 | 24 | sbin=`dirname "$0"` 25 | sbin=`cd "$sbin"; pwd` 26 | 27 | # Load the Spark configuration 28 | . "$sbin/spark-config.sh" 29 | 30 | # Stop the slaves, then the master 31 | "$sbin"/stop-slaves.sh 32 | "$sbin"/stop-master.sh 33 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/stop-history-server.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Stops the history server on the machine this script is executed on. 21 | 22 | sbin=`dirname "$0"` 23 | sbin=`cd "$sbin"; pwd` 24 | 25 | "$sbin"/spark-daemon.sh stop org.apache.spark.deploy.history.HistoryServer 1 26 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/stop-master.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Stops the master on the machine this script is executed on. 21 | 22 | sbin=`dirname "$0"` 23 | sbin=`cd "$sbin"; pwd` 24 | 25 | . "$sbin/spark-config.sh" 26 | 27 | "$sbin"/spark-daemon.sh stop org.apache.spark.deploy.master.Master 1 28 | 29 | if [ -e "$sbin"/../tachyon/bin/tachyon ]; then 30 | "$sbin"/../tachyon/bin/tachyon killAll tachyon.master.Master 31 | fi 32 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/files/spark/sbin/stop-slaves.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | sbin=`dirname "$0"` 21 | sbin=`cd "$sbin"; pwd` 22 | 23 | . "$sbin/spark-config.sh" 24 | 25 | .
"$SPARK_PREFIX/bin/load-spark-env.sh" 26 | 27 | # do before the below calls as they exec 28 | if [ -e "$sbin"/../tachyon/bin/tachyon ]; then 29 | "$sbin/slaves.sh" cd "$SPARK_HOME" \; "$sbin"/../tachyon/bin/tachyon killAll tachyon.worker.Worker 30 | fi 31 | 32 | if [ "$SPARK_WORKER_INSTANCES" = "" ]; then 33 | "$sbin"/spark-daemons.sh stop org.apache.spark.deploy.worker.Worker 1 34 | else 35 | for ((i=0; i<$SPARK_WORKER_INSTANCES; i++)); do 36 | "$sbin"/spark-daemons.sh stop org.apache.spark.deploy.worker.Worker $(( $i + 1 )) 37 | done 38 | fi 39 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/manifests/defaults.pp: -------------------------------------------------------------------------------- 1 | class spark::defaults { 2 | $install_dir = '/usr/lib/spark' 3 | $master_port = 7077 4 | $web_port = 8080 5 | $cores = undef 6 | $memory = undef 7 | $scratch_dir = "${install_dir}/work" 8 | } 9 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/manifests/master.pp: -------------------------------------------------------------------------------- 1 | class spark::master ( 2 | $spark_service_status = 'running', 3 | $master_port = $::spark::defaults::master_port, 4 | $web_port = $::spark::defaults::web_port, 5 | $install_dir = $::spark::defaults::install_dir, 6 | $worker_mem, 7 | ) inherits spark::defaults { 8 | 9 | class {'spark': 10 | master => $::fqdn, 11 | install_dir => $install_dir, 12 | worker_mem => $worker_mem, 13 | } 14 | Class['spark'] -> Class['spark::master'] 15 | 16 | # The Upstart service file. 17 | file {'/etc/init/spark-master.conf': 18 | content => template('spark/spark-master.conf.erb'), 19 | mode => '0644', 20 | owner => 'root', 21 | group => 'root', 22 | notify => Service['spark-master'], 23 | } 24 | 25 | file { "${install_dir}/bin/spark-master-runner.sh": 26 | content => template('spark/spark-master-runner.sh.erb'), 27 | owner => 'root', 28 | group => 'root', 29 | mode => '0744', 30 | } 31 | 32 | # The service that runs the master server. 33 | service {'spark-master': 34 | ensure => $spark_service_status, 35 | require => [File['/etc/init/spark-master.conf'], File["${install_dir}/bin/spark-master-runner.sh"]], 36 | hasrestart => true, 37 | hasstatus => true, 38 | restart => '/sbin/initctl restart spark-master', 39 | start => '/sbin/initctl start spark-master', 40 | stop => '/sbin/initctl stop spark-master', 41 | status => '/sbin/initctl status spark-master | grep "/running" 1>/dev/null 2>&1', 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/manifests/spark.pp: -------------------------------------------------------------------------------- 1 | class spark ( 2 | $master, 3 | $worker_mem, 4 | $install_dir 5 | ) { 6 | require spark::user 7 | 8 | 9 | # Better would be if they had a package repository available, but they do not at this moment. 10 | # (Nor do I, so this is the cleanest way without package managers). 
11 | file {$install_dir: 12 | ensure => directory, 13 | source => 'puppet:///modules/spark/spark', 14 | mode => '0744', 15 | recurse => true, 16 | owner => 'root', 17 | group => 'root', 18 | require => User['spark'], 19 | } 20 | 21 | 22 | file {"${install_dir}/conf/spark-env.sh": 23 | content => template('spark/spark-env.sh.erb'), 24 | mode => '0744', 25 | owner => 'root', 26 | group => 'root', 27 | require => File[$install_dir], 28 | } 29 | 30 | #file {"${install_dir}/conf/metrics.properties": 31 | # content => template('spark/metrics.properties.erb'), 32 | # mode => '0744', 33 | # owner => 'root', 34 | # group => 'root', 35 | # require => File[$install_dir], 36 | #} 37 | 38 | 39 | # Create the log directory. 40 | file {'/var/log/spark': 41 | ensure => directory, 42 | owner => 'root', 43 | group => 'root', 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/manifests/user.pp: -------------------------------------------------------------------------------- 1 | class spark::user { 2 | 3 | group {'spark': 4 | ensure => present, 5 | } 6 | 7 | user {'spark': 8 | ensure => present, 9 | shell => '/bin/bash', 10 | gid => 'spark', 11 | require => Group['spark'], 12 | } 13 | 14 | } 15 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/manifests/worker.pp: -------------------------------------------------------------------------------- 1 | class spark::worker ( 2 | $master, 3 | $spark_service_status = 'running', 4 | $master_port = $::spark::defaults::master_port, 5 | $web_port = $::spark::defaults::web_port, 6 | $install_dir = $::spark::defaults::install_dir, 7 | $cores = $::spark::defaults::cores, 8 | $memory = $::spark::defaults::memory, 9 | $scratch_dir = $::spark::defaults::scratch_dir, 10 | ) inherits spark::defaults { 11 | 12 | class {'spark': 13 | master => $master, 14 | install_dir => $install_dir, 15 | worker_mem => $memory, 16 | 17 | } 18 | Class['spark'] -> Class['spark::worker'] 19 | 20 | # The Upstart service file. 21 | file {'/etc/init/spark-worker.conf': 22 | content => template('spark/spark-worker.conf.erb'), 23 | mode => '0644', 24 | owner => 'root', 25 | group => 'root', 26 | notify => Service['spark-worker'], 27 | } 28 | 29 | file { "${install_dir}/bin/spark-worker-runner.sh": 30 | content => template('spark/spark-worker-runner.sh.erb'), 31 | owner => 'root', 32 | group => 'root', 33 | mode => '0744', 34 | } 35 | 36 | # The service that runs the worker daemon.
37 | service {'spark-worker': 38 | ensure => $spark_service_status, 39 | #provider => 'upstart', 40 | require => [File['/etc/init/spark-worker.conf'], File["${install_dir}/bin/spark-worker-runner.sh"]], 41 | hasrestart => true, 42 | hasstatus => true, 43 | restart => '/sbin/initctl restart spark-worker', 44 | start => '/sbin/initctl start spark-worker', 45 | stop => '/sbin/initctl stop spark-worker', 46 | status => '/sbin/initctl status spark-worker | grep "/running" 1>/dev/null 2>&1' 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/templates/spark-env.sh.erb: -------------------------------------------------------------------------------- 1 | #export SCALA_HOME=/opt/scala-2.9.3 2 | 3 | <%# SPARK_MASTER_OPTS="-Dspark.deploy.spreadOut=false" %> 4 | SPARK_JAVA_OPTS+=" -Dspark.local.dir=/raid/spark-local" 5 | <%# SPARK_JAVA_OPTS+=" -Dspark.speculation=true" %> 6 | #SPARK_JAVA_OPTS+="-XX:MaxPermSize=512m" 7 | #SPARK_JAVA_OPTS+="-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=9012 -Dcom.sun.management.jmxremote.local.only=true -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" 8 | #export SPARK_JAVA_OPTS 9 | 10 | export SPARK_MEM=<%= @worker_mem %> 11 | export SPARK_DAEMON_MEMORY=1g 12 | export SPARK_LIBRARY_PATH="/usr/lib/hadoop/lib/native:$SPARK_LIBRARY_PATH" 13 | export SPARK_CLASSPATH="/usr/lib/tachyon/target/tachyon-0.4.1-jar-with-dependencies.jar:$SPARK_CLASSPATH" 14 | 15 | # Bind Spark's web UIs to this machine's public EC2 hostname: 16 | #export SPARK_PUBLIC_DNS=`wget -q -O - http://instance-data.ec2.internal/latest/meta-data/public-hostname` 17 | 18 | # Set a high ulimit for large shuffles 19 | ulimit -n 1000000 20 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/templates/spark-master-runner.sh.erb: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function borrowed from the original Spark source. 4 | spark_rotate_log () 5 | { 6 | log=/var/log/spark/master.log; 7 | num=5; 8 | if [ -n "$2" ]; then 9 | num=$2 10 | fi 11 | if [ -f "$log" ]; then # rotate logs 12 | while [ $num -gt 1 ]; do 13 | prev=`expr $num - 1` 14 | [ -f "$log.$prev" ] && mv "$log.$prev" "$log.$num" 15 | num=$prev 16 | done 17 | mv "$log" "$log.$num"; 18 | fi 19 | } 20 | 21 | spark_rotate_log 22 | <%= @install_dir %>/bin/spark-class org.apache.spark.deploy.master.Master --ip <%= @fqdn %> --webui-port <%= @web_port %> --port <%= @master_port %> >> /var/log/spark/master.log 2>&1 < /dev/null 23 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/templates/spark-master.conf.erb: -------------------------------------------------------------------------------- 1 | description "Spark Master Service script" 2 | start on runlevel [2345] 3 | stop on runlevel [06] 4 | #setuid root 5 | #setgid root 6 | #console log 7 | 8 | chdir <%= @install_dir %> 9 | exec <%= @install_dir %>/bin/spark-master-runner.sh 10 | 11 | # Try to respawn with a maximum of 10 times in a 90 second window. 
12 | respawn 13 | respawn limit 10 90 14 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/templates/spark-worker-runner.sh.erb: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function borrowed from the original Spark source. 4 | spark_rotate_log () 5 | { 6 | log=/var/log/spark/worker.log; 7 | num=5; 8 | if [ -n "$2" ]; then 9 | num=$2 10 | fi 11 | if [ -f "$log" ]; then # rotate logs 12 | while [ $num -gt 1 ]; do 13 | prev=`expr $num - 1` 14 | [ -f "$log.$prev" ] && mv "$log.$prev" "$log.$num" 15 | num=$prev 16 | done 17 | mv "$log" "$log.$num"; 18 | fi 19 | } 20 | 21 | spark_rotate_log 22 | 23 | export SPARK_LIBRARY_PATH="/usr/lib/hadoop/lib/native:$SPARK_LIBRARY_PATH" 24 | 25 | <%= @install_dir %>/bin/spark-class org.apache.spark.deploy.worker.Worker spark://<%= @master %>:<%= @master_port %> --work-dir <%= @scratch_dir %><% if @cores -%> --cores <%= @cores %><% end -%><% if @memory -%> --memory <%= @memory %><% end -%> >> /var/log/spark/worker.log 2>&1 < /dev/null 26 | -------------------------------------------------------------------------------- /initial-deployment-puppet/modules/spark/templates/spark-worker.conf.erb: -------------------------------------------------------------------------------- 1 | description "Spark Worker Service script" 2 | start on runlevel [2345] 3 | stop on runlevel [06] 4 | #setuid root 5 | #setgid root 6 | #console log 7 | 8 | chdir <%= @install_dir %> 9 | exec <%= @install_dir %>/bin/spark-worker-runner.sh 10 | 11 | # Try to respawn with a maximum of 10 times in a 90 second window. 12 | respawn 13 | respawn limit 10 90 14 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYaml==3.11 2 | -------------------------------------------------------------------------------- /sample-application/.gitignore: -------------------------------------------------------------------------------- 1 | stderr.txt 2 | stdout.txt 3 | target 4 | project/{project,target} 5 | *.pickle 6 | *.log 7 | -------------------------------------------------------------------------------- /sample-application/build.sbt: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////// 2 | // 3 | // Copyright (c) 2014 Adobe Systems Incorporated. All rights reserved. 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | // 17 | /////////////////////////////////////////////////////////////////////////// 18 | 19 | import AssemblyKeys._ 20 | 21 | assemblySettings 22 | 23 | jarName in assembly := "ExampleApp.jar" 24 | 25 | name := "Example App" 26 | 27 | version := "1.0" 28 | 29 | scalaVersion := "2.10.3" 30 | 31 | // Load "provided" libraries with `sbt run`. 
run in Compile <<= Defaults.runTask(
  fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)
)

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "1.0.0" % "provided",
  "org.slf4j" % "slf4j-simple" % "1.7.7" // Logging.
)

resolvers += "Akka Repository" at "http://repo.akka.io/releases/"
--------------------------------------------------------------------------------
/sample-application/config.yaml.tmpl:
--------------------------------------------------------------------------------
###########################################################################
##
## Copyright (c) 2014 Adobe Systems Incorporated. All rights reserved.
##
## Licensed under the Apache License, Version 2.0 (the "License");
## you may not use this file except in compliance with the License.
## You may obtain a copy of the License at
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
###########################################################################

jar: ExampleApp.jar
local_jar_dir: target/scala-2.10/
remote_jar_dir: /tmp/
main_class: com.adobe.ExampleApp
remote_spark_dir: /usr/lib/spark
spark_master: spark://server_hostname:7077
spark_work: /raid/spark-work
--------------------------------------------------------------------------------
/sample-application/src/main/scala/ExampleApp.scala:
--------------------------------------------------------------------------------
///////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2014 Adobe Systems Incorporated. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////////

package com.adobe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

import java.io.{File,PrintWriter}

object ExampleApp {
  def main(args: Array[String]) {
    // Cluster settings are hard-coded to mirror config.yaml.tmpl: the assembled
    // jar is staged under /tmp/ and Spark is installed in /usr/lib/spark.
    val conf = new SparkConf()
      .setAppName("ExampleApp")
      .setMaster("spark://spark_master_hostname:7077")
      .setSparkHome("/usr/lib/spark")
      .setJars(Seq("/tmp/ExampleApp.jar"))
      .set("spark.executor.memory", "10g")
      .set("spark.cores.max", "4")
    val sc = new SparkContext(conf)
    // Toy workload: square a handful of numbers and print both RDDs.
    val nums = sc.parallelize(Seq(1, 2, 4, 8))
    val squares = nums.map { case num => num * num }
    println("Nums: " + nums.collect().mkString(", "))
    println("Squares: " + squares.collect().mkString(", "))
    sc.stop()
  }
}
--------------------------------------------------------------------------------
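
For orientation, a hedged sketch of how the sample application above might be built and launched by hand, based only on what build.sbt, config.yaml.tmpl, and ExampleApp.scala declare. The hostname placeholders are the templates' own, and the application-deployment fabfile presumably automates the same flow; this is not a verified procedure from the repository.

    # Manual sketch only -- paths come from the sample files above.
    cd sample-application

    # Build the assembly jar named in build.sbt; with Scala 2.10.3 it lands at
    # target/scala-2.10/ExampleApp.jar (local_jar_dir + jar in config.yaml.tmpl).
    sbt assembly

    # ExampleApp's setJars(Seq("/tmp/ExampleApp.jar")) expects the jar at /tmp/
    # on the machine that runs the driver, so stage it there first.
    cp target/scala-2.10/ExampleApp.jar /tmp/ExampleApp.jar

    # Launch the driver. spark-core is marked "provided", and the runTask
    # override in build.sbt puts provided dependencies back on the classpath
    # for `sbt run`. The standalone master URL is hard-coded in ExampleApp.scala.
    sbt run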