├── .gitignore ├── README.md ├── apache-hadoop-hdfs-precise ├── Dockerfile ├── build └── files │ ├── authorized_keys │ ├── configure_hadoop.sh │ ├── core-site.xml │ ├── hdfs-site.xml │ └── id_rsa ├── build ├── README.txt ├── build_all.sh ├── push_all.sh └── tag_all.sh ├── deploy ├── deploy.sh ├── kill_all.sh ├── start_nameserver.sh ├── start_shell.sh └── start_spark_cluster.sh ├── dnsmasq-precise ├── Dockerfile ├── build └── files │ └── default_cmd ├── mesos ├── NOTE.txt ├── build ├── deploy │ ├── deploy │ └── start_mesos_cluster.sh ├── mesos-base │ ├── Dockerfile │ ├── build │ └── files │ │ └── configure_mesos.sh ├── mesos-master │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_mesos_master.sh ├── mesos-worker │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_mesos_worker.sh ├── shark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ └── default_cmd └── spark-shell │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── test.txt ├── shark-0.7.0 ├── build ├── shark-base │ ├── Dockerfile │ ├── build │ └── files │ │ ├── configure_shark.sh │ │ ├── hive-site.xml │ │ └── shark-env.sh ├── shark-master │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_shark_master.sh ├── shark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── test.shark └── shark-worker │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── run_shark_worker.sh ├── shark-0.8.0 ├── build ├── shark-base │ ├── Dockerfile │ ├── build │ └── files │ │ ├── configure_shark.sh │ │ ├── hive-site.xml │ │ └── shark-env.sh ├── shark-master │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_shark_master.sh ├── shark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── test.shark └── shark-worker │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── run_shark_worker.sh ├── spark-0.7.3 ├── build ├── spark-base │ ├── Dockerfile │ ├── build │ └── files │ │ ├── configure_spark.sh │ │ ├── log4j.properties │ │ └── spark-env.sh ├── spark-master │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_spark_master.sh ├── spark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ ├── test.spark │ │ └── test.txt └── spark-worker │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── run_spark_worker.sh ├── spark-0.8.0 ├── NOTE.txt ├── build ├── spark-base │ ├── Dockerfile │ ├── build │ └── files │ │ ├── configure_spark.sh │ │ ├── log4j.properties │ │ └── spark-env.sh ├── spark-master │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_spark_master.sh ├── spark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ ├── test.spark │ │ └── test.txt └── spark-worker │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── run_spark_worker.sh ├── spark-0.9.0 ├── NOTE.txt ├── build ├── spark-base │ ├── Dockerfile │ ├── build │ └── files │ │ ├── configure_spark.sh │ │ ├── log4j.properties │ │ └── spark-env.sh ├── spark-master │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_spark_master.sh ├── spark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ ├── test.spark │ │ └── test.txt └── spark-worker │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── run_spark_worker.sh ├── spark-0.9.1 ├── NOTE.txt ├── build ├── spark-base │ ├── Dockerfile │ ├── build │ └── files │ │ ├── configure_spark.sh │ │ ├── log4j.properties │ │ └── spark-env.sh ├── spark-master │ 
├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_spark_master.sh ├── spark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ ├── test.spark │ │ └── test.txt └── spark-worker │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── run_spark_worker.sh ├── spark-1.0.0 ├── NOTE.txt ├── build ├── spark-base │ ├── Dockerfile │ ├── build │ └── files │ │ ├── configure_spark.sh │ │ ├── log4j.properties │ │ └── spark-env.sh ├── spark-master │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_spark_master.sh ├── spark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ ├── test.spark │ │ └── test.txt └── spark-worker │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── run_spark_worker.sh └── test └── test_all.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | files.hash 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dockerfiles for Spark and Shark 2 | 3 | ## Contents 4 | 5 | Dockerfiles to build Spark and Shark images for testing and 6 | development. 7 | 8 | ## Requirements 9 | 10 | Tested on Ubuntu 12.04 (Docker version 0.6.4), Ubuntu 13.10 (Docker 0.7.0 and 0.9.0) with the virtual 11 | switch 12 | lxcbr0 13 | enabled. For running Docker on Mac and Windows see [the docs](http://docs.docker.io). 14 | Also tested inside the VirtualBox Tiny Core Linux VirtualBox VM for Docker on 15 | Mac. 16 | 17 | Note: the earlier version of the scripts had problems with newer 18 | versions of Docker (0.7). If you encounter issues please pull the 19 | latest changes from https://github.com/amplab/docker-scripts.git 20 | master branch. 21 | 22 | ## Tips for running on Mac OS 23 | If you are running on Mac OS, installed as described 24 | [in the Docker installation docs](http://docs.docker.io/en/latest/installation/mac/) 25 | you need to run all commands inside the Docker virtual machine by first ssh-ing into it: 26 | 27 |
 28 | $ ./boot2docker ssh
 29 | # User: docker
 30 | # Pwd:  tcuser
 31 | 
32 | 33 | Then make sure that `python` is installed. Otherwise install it via 34 | `tce-ab` (search for python and install `python.tcz`). Newer versions 35 | of the image that comes with boot2docker also do not have `bash` installed 36 | (install package `bash.tcz`), which is required for the deployment scripts. 37 | 38 | Further, make sure that the virtual machine running the Docker daemon and 39 | the containers has sufficient memory allocated (at least 2GB for two Spark worker 40 | containers and one master container). This can be done in the VirtualBox 41 | GUI under the properties of the virtual machine. 42 | 43 | Finally, `boot2docker save` is a good way to preserve changes to the image 44 | between restarts of the virtual machine or host computer, 45 | for example the scripts that come with the cloned git repository (see below). 46 | 47 | ## Testing 48 | 49 | First clone the repository: 50 | 51 | $ git clone https://github.com/amplab/docker-scripts.git 52 | 53 | This repository contains the deploy scripts and the sources for the Docker 54 | image files, which can be easily modified. The main deploy script 55 | takes the following options: 56 | 57 |
 58 | $ sudo ./deploy/deploy.sh
 59 | usage: ./deploy.sh -i <image> [-w <#workers>] [-v <data_directory>] [-c]
 60 | 
 61 |   image:    spark or shark image from:
 62 |                  amplab/spark:0.9.0  amplab/spark:0.9.1  amplab/spark:1.0.0
 63 |                  amplab/shark:0.8.0
 64 | 
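For example, a hypothetical invocation that starts a three-worker Spark 1.0.0 cluster, mounts a host directory under /data in the containers, and immediately starts a shell container (the host path is only an illustration):

    # -v takes any host directory; -c attaches a shell container right away
    $ sudo ./deploy/deploy.sh -i amplab/spark:1.0.0 -w 3 -v /home/andre/data -c

The individual options are explained below.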
65 | 66 | The script starts either a standalone Spark cluster or a standalone 67 | Spark/Shark cluster with a given number of worker nodes. Note that 68 | on the first call it may take a while for Docker to download the 69 | various images from the repository. 70 | 71 | In addition to Spark (and Shark) the cluster also runs a Hadoop HDFS 72 | filesystem. When the deploy script is run it generates one container 73 | for the master node, one container for each worker node, and one extra 74 | container running a Dnsmasq DNS forwarder. The latter can also be 75 | used to resolve node names on the host, for example to access the 76 | worker logs via the Spark web UI. 77 | 78 | Optionally one can set the number of workers (default: 2) and a data directory, 79 | which is a local path on the host that is mounted on the master and 80 | worker containers and appears there under /data. 81 | 82 | Both the Spark and Shark shells are started in a separate container. 83 | This container can be started directly by 84 | passing "-c" to the deploy script. 85 | 86 | Each node (worker and master) also runs an sshd which is 87 | _pre-configured with the given RSA key_. Note that you should change 88 | this key if you plan to expose services running inside the containers. 89 | Since the permissions of the key as cloned from the repository are 90 | likely wrong, you need to change them if you intend to log in with ssh: 91 | 92 |
93 | chmod go-rwx apache-hadoop-hdfs-precise/files/id_rsa
 94 | 
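If you do plan to expose the containers, one way to swap out the baked-in key pair is to generate a fresh one and rebuild the images. This is only a sketch, assuming you rebuild (and, if needed, re-tag) the images yourself afterwards rather than pulling them from the Docker repository:

    # overwrite the checked-in key pair with a freshly generated one (no passphrase)
    $ ssh-keygen -t rsa -N "" -f apache-hadoop-hdfs-precise/files/id_rsa
    # make the new public key the only authorized key
    $ cp apache-hadoop-hdfs-precise/files/id_rsa.pub apache-hadoop-hdfs-precise/files/authorized_keys
    # rebuild the images so they pick up the new key (see the Building section)
    $ sudo ./build/build_all.sh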
95 | 96 | ### Example: Running a Spark cluster 97 | 98 | Starting from the directory in which the repository was cloned do 99 | 100 | #### Deploy the cluster 101 | 102 | $ sudo ./deploy/deploy.sh -i amplab/spark:0.9.0 -w 3 103 | 104 | #### Wait a few seconds 105 | 106 | Wait for the "cluster" to come up. Note that it can take longer to download 107 | the container images the first time but after that the process is fairly quick. 108 | When the cluster comes up you should see something like this: 109 | 110 |
111 | > sudo ./deploy.sh -i amplab/spark:0.9.0 -w 3 
112 | *** Starting Spark 0.9.0 ***
113 | starting nameserver container
114 | started nameserver container:  069557913d98a37caf43f8238dfdf181aea5ab30eb42e382db83307e277cfa9e
115 | DNS host->IP file mapped:      /tmp/dnsdir_12015/0hosts
116 | NAMESERVER_IP:                 172.17.0.8
117 | waiting for nameserver to come up 
118 | starting master container
119 | started master container:      f50a65d2ef7b17bffed7075ac2de4a7b52c26adff15bdbe14d3280ef4991c9d6
120 | MASTER_IP:                     172.17.0.9
121 | waiting for master ........
122 | waiting for nameserver to find master 
123 | starting worker container
124 | started worker container:  576d7d223f59a6da7a0e73311d1e082fad27895aef53edf3635264fb00b70258
125 | starting worker container
126 | started worker container:  5672ea896e179b51fe2f1ae5d542c35706528cd3a768ba523324f434bb2b2413
127 | starting worker container
128 | started worker container:  3cdf681f7c99c1e19f7b580ac911e139923e9caca943fd006fb633aac5b20001
129 | waiting for workers to register .....
130 | 
131 | ***********************************************************************
132 | start shell via:            sudo /home/andre/docker-scripts/deploy/start_shell.sh -i amplab/spark-shell:0.9.0 -n 069557913d98a37caf43f8238dfdf181aea5ab30eb42e382db83307e277cfa9e 
133 | 
134 | visit Spark WebUI at:       http://172.17.0.9:8080/
135 | visit Hadoop Namenode at:   http://172.17.0.9:50070
136 | ssh into master via:        ssh -i /home/andre/docker-scripts/deploy/../apache-hadoop-hdfs-precise/files/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no root@172.17.0.9
137 | 
138 | /data mapped:               
139 | 
140 | kill master via:           sudo docker kill f50a65d2ef7b17bffed7075ac2de4a7b52c26adff15bdbe14d3280ef4991c9d6
141 | ***********************************************************************
142 | 
143 | to enable cluster name resolution add the following line to _the top_ of your host's /etc/resolv.conf:
144 | nameserver 172.17.0.8
145 | 
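The last two lines of the output concern name resolution on the host (see also the Tips section below). One way to prepend the nameserver entry, shown here with the NAMESERVER_IP from this example run, is the following; note that the edit may be undone if /etc/resolv.conf is managed automatically (e.g. by resolvconf or NetworkManager):

    # substitute the NAMESERVER_IP printed by your own deploy run
    $ sudo sed -i '1i nameserver 172.17.0.8' /etc/resolv.conf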
146 | 147 | #### Start the Spark shell container as shown above, for example: 148 | 149 | $ sudo /home/andre/docker-scripts/deploy/start_shell.sh -i amplab/spark-shell:0.9.0 -n 069557913d98a37caf43f8 150 | 151 | The parameter passed with -n is the ID of the nameserver container. 152 | Then attach to the running shell via the given command, for example: 153 | 154 | $ sudo docker attach 9ac49b09bf18a13c7 155 | 156 | If the screen appears to stay blank just hit return to get to the prompt. 157 | 158 | #### Execute an example: 159 | 160 |
161 | scala> val textFile = sc.textFile("hdfs://master:9000/user/hdfs/test.txt")
162 | scala> textFile.count()
163 | scala> textFile.map({line => line}).collect()
164 | 
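The test.txt read above is a small sample file that ships with the spark-shell image. To run the same commands against your own data you can copy a file into HDFS from the master node. This is only a sketch, assuming the master IP and ssh key from the example output above and a cluster deployed with -v so that the host data directory is visible under /data (mydata.txt is a placeholder):

    # on the host: log into the master container (command as printed by deploy.sh)
    $ ssh -i apache-hadoop-hdfs-precise/files/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no root@172.17.0.9
    # inside the master container: copy a file from the mounted /data directory into HDFS as the hdfs user
    $ sudo -u hdfs hadoop fs -put /data/mydata.txt /user/hdfs/mydata.txt

Afterwards the file can be read from the shell via sc.textFile("hdfs://master:9000/user/hdfs/mydata.txt").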
165 | 166 | 167 | #### Terminate the cluster: 168 | 169 | $ sudo ./deploy/kill_all.sh spark 170 | $ sudo ./deploy/kill_all.sh nameserver 171 | 172 | ### Shark 173 | 174 | Basically the same steps apply, except that the Shark images are chosen instead of the Spark ones 175 | (the former contain the Shark binaries in addition to Spark). 176 | 177 | #### Deploy the cluster 178 | 179 | $ sudo ./deploy/deploy.sh -i amplab/shark:0.8.0 -w 3 180 | 181 | #### Wait a few seconds 182 | 183 | Wait for the "cluster" to come up. Note that it can take longer to download 184 | the container images the first time, but after that the process is fairly quick. 185 | When the cluster comes up you should see something like this: 186 | 187 |
188 | *** Starting Shark 0.8.0 + Spark ***
189 | starting nameserver container
190 | started nameserver container:  952d22e085c3b74e829e006ab536d45d31800c463832e43d8679bbf3d703940e
191 | DNS host->IP file mapped:      /tmp/dnsdir_30578/0hosts
192 | NAMESERVER_IP:                 172.17.0.13
193 | waiting for nameserver to come up 
194 | starting master container
195 | started master container:      169f253eaddadb19b6eb28e79f148eef892f20d34602ffb42d3e57625dc61652
196 | MASTER_IP:                     172.17.0.14
197 | waiting for master ........
198 | waiting for nameserver to find master 
199 | starting worker container
200 | started worker container:  1c6920c96d5ad684a2f591bfb334323c5854cdd7a0da49982baaf77dc4d62ac7
201 | starting worker container
202 | started worker container:  7250dcfb882e2d17441c8c59361d10d8c59afb2b295719ba35f59bc72c6f17a5
203 | starting worker container
204 | started worker container:  26823e188a2a5a5897ed4b9bf0fca711dc7f98674fe62eb78fb49cf031bec79c
205 | waiting for workers to register .......
206 | 
207 | ***********************************************************************
208 | start shell via:            sudo /home/andre/docker-scripts/deploy/start_shell.sh -i amplab/shark-shell:0.8.0 -n 952d22e085c3b74e829e006ab536d45d31800c463832e43d8679bbf3d703940e 
209 | 
210 | visit Spark WebUI at:       http://172.17.0.14:8080/
211 | visit Hadoop Namenode at:   http://172.17.0.14:50070
212 | ssh into master via:        ssh -i /home/andre/docker-scripts/deploy/../apache-hadoop-hdfs-precise/files/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no root@172.17.0.14
213 | 
214 | /data mapped:               
215 | 
216 | kill master via:           sudo docker kill 169f253eaddadb19b6eb28e79f148eef892f20d34602ffb42d3e57625dc61652
217 | ***********************************************************************
218 | 
219 | to enable cluster name resolution add the following line to _the top_ of your host's /etc/resolv.conf:
220 | nameserver 172.17.0.13
221 | 
222 | 223 | #### Start the Shark shell container as shown above, for example: 224 | 225 | $ sudo /home/andre/docker-scripts/deploy/start_shell.sh -i amplab/shark-shell:0.8.0 -n 952d22e085c3b74e829e00 226 | 227 | The parameter passed with -n is the ID of the nameserver container. 228 | Then attach to the running shell via the given command, for example: 229 | 230 | $ sudo docker attach 9ac49b09bf18a13c7 231 | 232 | If the screen appears to stay blank just hit return to get to the prompt. 233 | 234 | #### Execute an example: 235 | 236 |
237 | shark> CREATE TABLE src(key INT, value STRING);
238 | shark> LOAD DATA LOCAL INPATH '${env:HIVE_HOME}/examples/files/kv1.txt' INTO TABLE src;
239 | shark> SELECT COUNT(1) FROM src;
240 | 
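To double-check that the loaded data actually ended up in HDFS, you can list the table's directory from the master node. This is only a sketch and assumes the default Hive warehouse location (/user/hive/warehouse); the hive-site.xml shipped with the shark images may configure a different path:

    # inside the master container (ssh command as printed by deploy.sh)
    $ sudo -u hdfs hadoop fs -ls /user/hive/warehouse/src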
241 | 242 | #### Terminate the cluster: 243 | 244 | $ sudo ./deploy/kill_all.sh shark 245 | $ sudo ./deploy/kill_all.sh nameserver 246 | 247 | ## Building 248 | 249 | If you prefer to build the images yourself (or intend to modify them) rather 250 | than downloading them from the Docker repository, you can build 251 | all Spark and Shark images in the correct order via the build script: 252 | 253 | $ ./build/build_all.sh 254 | 255 | The script builds the images in an order that satisfies the chain of 256 | dependencies: 257 | 258 | apache-hadoop-hdfs-precise -> spark-base -> spark-{master, worker, shell} 259 | 260 | apache-hadoop-hdfs-precise -> spark-base -> shark-base -> shark-{master, worker, shell} 261 | 262 | You can always (re-)build single images by cd-ing into the image directory and doing 263 | 264 | $ . build 265 | 266 | ## Best practices for Dockerfiles and startup scripts 267 | 268 | The following are just some conventions that made the generation of the images easier. They 269 | are not enforced in any way by Docker. 270 | 271 | The images and startup scripts follow the structure below in order to reuse 272 | as much as possible of the image they depend on. There are two types of images, 273 | base images and leaf images. Leaf images, as the name suggests, 274 | are images that are leaves in the dependency tree. For example, spark-base as a base 275 | image depends on apache-hadoop-hdfs-precise. spark-master depends on spark-base as 276 | its base image and is itself a leaf. 277 | 278 | In addition to its Dockerfile, each image has a 279 | files/ 280 | subdirectory in its image directory that contains files (config files, data files) that will be copied 281 | to the 282 | /root/image_name_files 283 | directory inside the image. 284 | 285 | ### Base images 286 | 287 | Base images are images that are intended to be extended by other images and therefore do not 288 | have a default command or entry point. They are good for testing though, e.g., by running 289 | /bin/bash 290 | inside them. 291 | 292 | 293 | For base images such as spark-base, besides data files the 294 | files/ 295 | directory also contains 296 | files/configure_spark.sh 297 | which is a script that contains four functions: 298 | 299 | * create_spark_directories 300 | for creating required directories such as the working directory 301 | * deploy_spark_files 302 | which copies files from 303 | /root/image_name_files 304 | to the required system path locations 305 | * configure_spark 306 | which changes settings in config files and takes the IP of the master as an argument 307 | * prepare_spark 308 | which calls the previous three in the given order and takes the IP of the master as an argument 309 | 310 | 311 | All of the functions of a __base-image__'s configure script (so also those inside 312 | files/configure_spark.sh 313 | ), except __prepare_spark__, first call their corresponding functions in the image that spark-base depends on (apache-hadoop-hdfs-precise in this case). Therefore all the underlying services get initialized before the top-level service. 314 | 315 | ### Leaf images 316 | 317 | For leaf images such as spark-master, besides data files the 318 | files/ 319 | directory also contains 320 | files/default_cmd 321 | which is chosen in the image's Dockerfile as the default command (or entry point) of the image. This means the command 322 | inside is executed whenever the container is started. 323 | 324 | 325 | The default command script executes the following steps, in this order: 326 | 327 | 1.
The first thing the default command does is call the prepare 328 | function of the configure script inside its base image. In this case, the default command script calls the function 329 | prepare_spark 330 | inside 331 | /root/spark-base/configure_spark.sh 332 | which is the location the configure script of spark-base was copied to. 333 | 2. After that, now that the base image's configuration (and the configuration of the images it inherits from) has completed, the 334 | default command may start services it relies on, such as the Hadoop namenode service in the case of spark-master. 335 | 3. Finally, the default command script of spark-master runs a second script under userid hdfs 336 | (the Hadoop HDFS super user), which is 337 | files/run_spark_master.sh 338 | and which actually starts the master. 339 | 340 | 341 | The spark-worker default command proceeds along the same lines but starts a Spark worker together with a Hadoop datanode instead. 342 | 343 | ## Tips 344 | 345 | ### Name resolution on host 346 | 347 | In order to resolve names (such as "master", "worker1", etc.), add the IP 348 | of the nameserver container to the top of /etc/resolv.conf on the host. 349 | 350 | ### Maintaining local Docker image repository 351 | 352 | After a while of building and debugging images, the local image repository gets 353 | full of intermediate images that serve no real purpose other than 354 | debugging a broken build. To remove these, do 355 | 356 | $ sudo docker images | grep "<none>" | awk '{print $3}' | xargs sudo docker rmi 357 | 358 | Also, data from stopped containers tends to accumulate. In order to remove all container data (__only do this when no containers are running__) do 359 | 360 | $ sudo docker rm `sudo docker ps -a -q` 361 | -------------------------------------------------------------------------------- /apache-hadoop-hdfs-precise/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base Ubuntu Precise 12.04 LTS image 2 | # 3 | FROM ubuntu:precise 4 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 5 | 6 | # Setup a volume for data 7 | VOLUME ["/data"] 8 | 9 | # Set correct source list 10 | RUN echo "deb http://archive.ubuntu.com/ubuntu precise main universe" > /etc/apt/sources.list 11 | RUN echo "deb http://archive.ubuntu.com/ubuntu precise-updates main universe" >> /etc/apt/sources.list 12 | 13 | # install a few other useful packages plus OpenJDK 7 14 | RUN apt-get update && apt-get upgrade -y && apt-get install -y less openjdk-7-jre-headless net-tools vim-tiny sudo openssh-server iputils-ping python2.7 15 | 16 | # Install Hadoop 17 | ADD http://mirror.sdunix.com/apache/hadoop/common/hadoop-1.2.1/hadoop_1.2.1-1_x86_64.deb /root/ 18 | RUN dpkg -i /root/hadoop_1.2.1-1_x86_64.deb && rm /root/hadoop_1.2.1-1_x86_64.deb 19 | 20 | # Docker messes up /etc/hosts and adds two entries for 127.0.0.1 21 | # we try to recover from that by giving /etc/resolv.conf and therefore 22 | # the nameserver priority 23 | RUN sed -i s/"files dns"/"dns files"/ /etc/nsswitch.conf 24 | 25 | # add Hadoop config file templates 26 | ADD files /root/hadoop_files 27 | 28 | # Set JAVA_HOME 29 | ENV JAVA_HOME /usr/lib/jvm/java-7-openjdk-amd64 30 | -------------------------------------------------------------------------------- /apache-hadoop-hdfs-precise/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . 
-type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}apache-hadoop-hdfs-precise:1.2.1 . 5 | -------------------------------------------------------------------------------- /apache-hadoop-hdfs-precise/files/authorized_keys: -------------------------------------------------------------------------------- 1 | ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDp2atNK3bux0z3d2Aojkl231Lf6X7HZUYIBt3XzUs+wnTzzB/eH2ubS5Wdwyy5daA4itsvX6hI1o/LQOfRBdjXqIVl+IFXFdwNQ0saCSNh65O2ynuMwsxUXhBJAGoBg6sTXq1ZPNQk1JqopUBP6+H4jpnKFW3JosON9QopQdkkYIz/frHs3HojfbydQesGNovanKrGYV3QeFVQDPxseufRZtHjrTk1hQ3FEayQCTyqJ8JDE6DMrirNEVBTuuNZ/Z2afPLWcZIKQ46E73p9HhqcaWEph6xQ3Ha/WV9oK0jenfz4b+sGrUItTbzuP8SsUiA4yZrZaN4BubDi4oPALOr/ root@423e412aa505 2 | -------------------------------------------------------------------------------- /apache-hadoop-hdfs-precise/files/configure_hadoop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | hadoop_files=( "/root/hadoop_files/core-site.xml" "/root/hadoop_files/hdfs-site.xml" ) 4 | 5 | function create_hadoop_directories() { 6 | rm -rf /root/.ssh 7 | mkdir /root/.ssh 8 | chmod go-rx /root/.ssh 9 | mkdir /var/run/sshd 10 | } 11 | 12 | function deploy_hadoop_files() { 13 | for i in "${hadoop_files[@]}"; 14 | do 15 | filename=$(basename $i); 16 | cp $i /etc/hadoop/$filename; 17 | done 18 | cp /root/hadoop_files/id_rsa /root/.ssh 19 | chmod go-rwx /root/.ssh/id_rsa 20 | cp /root/hadoop_files/authorized_keys /root/.ssh/authorized_keys 21 | chmod go-wx /root/.ssh/authorized_keys 22 | } 23 | 24 | function configure_hadoop() { 25 | sed -i s/__MASTER__/$1/ /etc/hadoop/core-site.xml 26 | sed -i s/"JAVA_HOME=\/usr\/lib\/jvm\/java-6-sun"/"JAVA_HOME=\/usr\/lib\/jvm\/java-7-openjdk-amd64"/ /etc/hadoop/hadoop-env.sh 27 | } 28 | 29 | function prepare_hadoop() { 30 | create_hadoop_directories 31 | deploy_hadoop_files 32 | configure_hadoop $1 33 | } 34 | -------------------------------------------------------------------------------- /apache-hadoop-hdfs-precise/files/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fs.default.name 4 | hdfs://__MASTER__:9000 5 | 6 | 7 | -------------------------------------------------------------------------------- /apache-hadoop-hdfs-precise/files/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | dfs.replication 4 | 1 5 | 6 | 7 | -------------------------------------------------------------------------------- /apache-hadoop-hdfs-precise/files/id_rsa: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | MIIEpAIBAAKCAQEA6dmrTSt27sdM93dgKI5Jdt9S3+l+x2VGCAbd181LPsJ088wf 3 | 3h9rm0uVncMsuXWgOIrbL1+oSNaPy0Dn0QXY16iFZfiBVxXcDUNLGgkjYeuTtsp7 4 | jMLMVF4QSQBqAYOrE16tWTzUJNSaqKVAT+vh+I6ZyhVtyaLDjfUKKUHZJGCM/36x 5 | 7Nx6I328nUHrBjaL2pyqxmFd0HhVUAz8bHrn0WbR4605NYUNxRGskAk8qifCQxOg 6 | zK4qzRFQU7rjWf2dmnzy1nGSCkOOhO96fR4anGlhKYesUNx2v1lfaCtI3p38+G/r 7 | Bq1CLU287j/ErFIgOMma2WjeAbmw4uKDwCzq/wIDAQABAoIBAQCBgFZZ/Pj3EI2x 8 | +XzZ2LocR144u7DGsXHP3iWabYj+72ce3+rB8np/3KK1ZDFvXxFkXpk1Ke8irxeg 9 | gogd+/PysdN1/eF6nZNoEN0VRPxALNp3frhe4j2PdyvjkYQi5IynxGWRJpuA7e/b 10 | 9u+fksxn/mhyPd23rRhIk+uVn26lsnccHhCkfqr+Szm/xFsTUhYQ1B8bfrqhA1Le 11 | WRrBa03JXocd2y3TdzeaQ+AtvbpAy9Fc28N7xkDsuh+H1y74jRhFzBXd4WnYuxze 12 | 
/PAD3hpgtCDGGnGpwE2SMM8fZJ7vLOPAsMUuz1tvLbKcoTTdaUw4fBur/XQHloW7 13 | k7adoW6BAoGBAP0bdE1uynnwZOFDhmpMvdYfodwlv3Far+QZwVroSa64YWBaeAef 14 | v0AO75p/EiQJEGWB9bgOAyrbOFdRqLtUF14lQw4ZLUV7sQu/o2Z0sVMSRCVWuNDf 15 | W8sk74RtH3WB7lutOMP3WyYopOUZtTK1rZrRNxD4+edq7+utAba+DLS/AoGBAOyF 16 | 31hype9DkOHgD/jWU7tNrJprLkNkSHe/Aq5JdKesgw84AOSKO4W1/uXOly4VOt6Z 17 | 54eeW1gt+uKT292GEl66TO8PIxszfsUzpYpTKkSzrl5OsM9hUlitJwpff/D9Mbxw 18 | fZWt0EjKlBQWc83sMBwCe8ZyNh/WueBIKH5HjhnBAoGAEwFRvVK5X2iemo+Qc0Dp 19 | 7D8Zz0cCVgeiN3V7oFDa34S2wx5n7uKe4Ld+ZFJwUUZg9c5JXhWnRTuKwnu+OLq6 20 | unX/z/ox/Qqpo6EzKslOW1d+yHL3k6+B3AIc/guXliI4fKfIIGbdcEMTBqTkhzc/ 21 | HuXgxaR8V1UfSMoH2+nvWE8CgYAcw4MP3JF1cYATGA6ZMmdoZd/Rv6sWowF1HpOS 22 | 4nf/VCl0Fll1caIfdqyTAfa8sfRA0fKoOYfeR2k1WMnqPL3LK1jj0bFxQ2ftT4SY 23 | N9jyFe/kpCk4bxt2kUgoKMkEY6ZCxmNfao3j7E7pynk217xaC6tFzOnsIU7liaDz 24 | CnyrgQKBgQDtjairs6ehaqRu8Uk44gQoNIlReJ8qp7YmfPlK8ylFNTALs37c4308 25 | Qbjp+jLt7w+XMYnNaZPSNN1mt6EyWFSqUc+5QbfQpbw1cZRI1UBIQDwJjZUS04Ou 26 | H75Rif72nQxHh9Ly5CMNCEyioin7kq945vQbyAwyEr7+tomhUZaq9g== 27 | -----END RSA PRIVATE KEY----- 28 | -------------------------------------------------------------------------------- /build/README.txt: -------------------------------------------------------------------------------- 1 | Building and publishing images to the amplab account: 2 | 1. make sure IMAGE_PREFIX="" (see build_all.sh) 3 | 2. build_all.sh 4 | 3. set IMAGE_PREFIX="amplab/" 5 | 4. build_all.sh 6 | 5. tag_all.sh 7 | 6. push_all.sh 8 | -------------------------------------------------------------------------------- /build/build_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ "$USER" != "root" ]]; then 4 | echo "please run as: sudo $0" 5 | exit 1 6 | fi 7 | 8 | CURDIR=$(pwd) 9 | BASEDIR=$(cd $(dirname $0); pwd)"/.." 
10 | dir_list=( "dnsmasq-precise" "apache-hadoop-hdfs-precise" "spark-0.7.3" "shark-0.7.0" "spark-0.8.0" "spark-0.9.0" "shark-0.8.0" ) 11 | 12 | export IMAGE_PREFIX="" 13 | #"amplab/" 14 | 15 | # NOTE: the order matters but this is the right one 16 | for i in ${dir_list[@]}; do 17 | echo building $i; 18 | cd ${BASEDIR}/$i 19 | cat build 20 | ./build 21 | done 22 | cd $CURDIR 23 | -------------------------------------------------------------------------------- /build/push_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ "$USER" != "root" ]]; then 4 | echo "please run as: sudo $0" 5 | exit 1 6 | fi 7 | 8 | image_list=( "apache-hadoop-hdfs-precise" "dnsmasq-precise" "spark-master" "spark-worker" "spark-shell" "shark-master" "shark-worker" "shark-shell" ) 9 | 10 | IMAGE_PREFIX="amplab/" 11 | 12 | # NOTE: the order matters but this is the right one 13 | for i in ${image_list[@]}; do 14 | echo docker push ${IMAGE_PREFIX}${i} 15 | docker push ${IMAGE_PREFIX}${i} 16 | done 17 | -------------------------------------------------------------------------------- /build/tag_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ "$USER" != "root" ]]; then 4 | echo "please run as: sudo $0" 5 | exit 1 6 | fi 7 | 8 | image_list=("spark-master:0.9.0" "spark-worker:0.9.0" "spark-shell:0.9.0" "shark-master:0.8.0" "shark-worker:0.8.0" "shark-shell:0.8.0" ) 9 | 10 | IMAGE_PREFIX="amplab/" 11 | 12 | # NOTE: the order matters but this is the right one 13 | for i in ${image_list[@]}; do 14 | image=$(echo $i | awk -F ":" '{print $1}') 15 | echo docker tag ${IMAGE_PREFIX}${i} ${IMAGE_PREFIX}${image}:latest 16 | docker tag ${IMAGE_PREFIX}${i} ${IMAGE_PREFIX}${image}:latest 17 | done 18 | -------------------------------------------------------------------------------- /deploy/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEBUG=0 4 | BASEDIR=$(cd $(dirname $0); pwd) 5 | 6 | spark_images=( "amplab/spark:0.9.0" "amplab/spark:0.9.1" "amplab/spark:1.0.0") 7 | shark_images=( "amplab/shark:0.8.0" ) 8 | NAMESERVER_IMAGE="amplab/dnsmasq-precise" 9 | 10 | start_shell=0 11 | VOLUME_MAP="" 12 | 13 | image_type="?" 14 | image_version="?" 15 | NUM_WORKERS=2 16 | 17 | source $BASEDIR/start_nameserver.sh 18 | source $BASEDIR/start_spark_cluster.sh 19 | 20 | function check_root() { 21 | if [[ "$USER" != "root" ]]; then 22 | echo "please run as: sudo $0" 23 | exit 1 24 | fi 25 | } 26 | 27 | function print_help() { 28 | echo "usage: $0 -i [-w <#workers>] [-v ] [-c]" 29 | echo "" 30 | echo " image: spark or shark image from:" 31 | echo -n " " 32 | for i in ${spark_images[@]}; do 33 | echo -n " $i" 34 | done 35 | echo "" 36 | echo -n " " 37 | for i in ${shark_images[@]}; do 38 | echo -n " $i" 39 | done 40 | echo "" 41 | } 42 | 43 | function parse_options() { 44 | while getopts "i:w:cv:h" opt; do 45 | case $opt in 46 | i) 47 | echo "$OPTARG" | grep "spark:" > /dev/null; 48 | if [ "$?" -eq 0 ]; then 49 | image_type="spark" 50 | fi 51 | echo "$OPTARG" | grep "shark:" > /dev/null; 52 | if [ "$?" 
-eq 0 ]; then 53 | image_type="shark" 54 | fi 55 | image_name=$(echo "$OPTARG" | awk -F ":" '{print $1}') 56 | image_version=$(echo "$OPTARG" | awk -F ":" '{print $2}') 57 | ;; 58 | w) 59 | NUM_WORKERS=$OPTARG 60 | ;; 61 | h) 62 | print_help 63 | exit 0 64 | ;; 65 | c) 66 | start_shell=1 67 | ;; 68 | v) 69 | VOLUME_MAP=$OPTARG 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$image_type" == "?" ]; then 75 | echo "missing or invalid option: -i " 76 | exit 1 77 | fi 78 | 79 | if [ ! "$VOLUME_MAP" == "" ]; then 80 | echo "data volume chosen: $VOLUME_MAP" 81 | VOLUME_MAP="-v $VOLUME_MAP:/data" 82 | fi 83 | } 84 | 85 | check_root 86 | 87 | if [[ "$#" -eq 0 ]]; then 88 | print_help 89 | exit 1 90 | fi 91 | 92 | parse_options $@ 93 | 94 | if [ "$image_type" == "spark" ]; then 95 | SPARK_VERSION="$image_version" 96 | echo "*** Starting Spark $SPARK_VERSION ***" 97 | elif [ "$image_type" == "shark" ]; then 98 | SHARK_VERSION="$image_version" 99 | # note: we currently don't have a Shark 0.9 image but it's safe Spark 100 | # to Shark's version for all but Shark 0.7.0 101 | if [ "$SHARK_VERSION" == "0.9.0" ] || [ "$SHARK_VERSION" == "0.8.0" ]; then 102 | SPARK_VERSION="$SHARK_VERSION" 103 | else 104 | SPARK_VERSION="0.7.3" 105 | fi 106 | echo "*** Starting Shark $SHARK_VERSION + Spark ***" 107 | else 108 | echo "not starting anything" 109 | exit 0 110 | fi 111 | 112 | start_nameserver $NAMESERVER_IMAGE 113 | wait_for_nameserver 114 | start_master ${image_name}-master $image_version 115 | wait_for_master 116 | if [ "$image_type" == "spark" ]; then 117 | SHELLCOMMAND="sudo $BASEDIR/start_shell.sh -i ${image_name}-shell:$SPARK_VERSION -n $NAMESERVER $VOLUME_MAP" 118 | elif [ "$image_type" == "shark" ]; then 119 | SHELLCOMMAND="sudo $BASEDIR/start_shell.sh -i ${image_name}-shell:$SHARK_VERSION -n $NAMESERVER $VOLUME_MAP" 120 | fi 121 | 122 | start_workers ${image_name}-worker $image_version 123 | get_num_registered_workers 124 | echo -n "waiting for workers to register " 125 | until [[ "$NUM_REGISTERED_WORKERS" == "$NUM_WORKERS" ]]; do 126 | echo -n "." 
127 | sleep 1 128 | get_num_registered_workers 129 | done 130 | echo "" 131 | print_cluster_info "$SHELLCOMMAND" 132 | if [[ "$start_shell" -eq 1 ]]; then 133 | SHELL_ID=$($SHELLCOMMAND | tail -n 1 | awk '{print $4}') 134 | sudo docker attach $SHELL_ID 135 | fi 136 | -------------------------------------------------------------------------------- /deploy/kill_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function kill_containers() { 4 | containers=($1) 5 | for i in "${containers[@]}"; do 6 | echo "killing container $i" 7 | sudo docker kill "$i" 8 | done 9 | } 10 | 11 | if [ "$#" -ne "1" ]; then 12 | echo -e "usage:\n $0 spark\n $0 shark\n $0 mesos\n $0 nameserver" 13 | exit 1; 14 | fi 15 | 16 | if [[ "$USER" != "root" ]]; then 17 | echo "please run as: sudo $0" 18 | exit 1 19 | fi 20 | 21 | clustertype=$1 22 | 23 | if [[ "$clustertype" == "nameserver" ]]; then 24 | nameserver=$(sudo docker ps | grep dnsmasq_files | awk '{print $1}' | tr '\n' ' ') 25 | kill_containers "$nameserver" 26 | else 27 | master=$(sudo docker ps | grep ${clustertype}_master | awk '{print $1}' | tr '\n' ' ') 28 | workers=$(sudo docker ps | grep ${clustertype}_worker | awk '{print $1}' | tr '\n' ' ') 29 | shells=$(sudo docker ps | grep ${clustertype}_shell | awk '{print $1}' | tr '\n' ' ') 30 | kill_containers "$master" 31 | kill_containers "$workers" 32 | kill_containers "$shells" 33 | fi 34 | 35 | -------------------------------------------------------------------------------- /deploy/start_nameserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAMESERVER=-1 4 | NAMESERVER_IP= 5 | DOMAINNAME= 6 | #".mycluster.com" 7 | 8 | # starts the dnsmasq nameserver 9 | function start_nameserver() { 10 | DNSDIR="/tmp/dnsdir_$RANDOM" 11 | DNSFILE="${DNSDIR}/0hosts" 12 | mkdir $DNSDIR 13 | 14 | echo "starting nameserver container" 15 | if [ "$DEBUG" -gt 0 ]; then 16 | echo sudo docker run -d -h nameserver${DOMAINNAME} -v $DNSDIR:/etc/dnsmasq.d $1 17 | fi 18 | NAMESERVER=$(sudo docker run -d -h nameserver${DOMAINNAME} -v $DNSDIR:/etc/dnsmasq.d $1) 19 | 20 | if [ "$NAMESERVER" = "" ]; then 21 | echo "error: could not start nameserver container from image $1" 22 | exit 1 23 | fi 24 | 25 | echo "started nameserver container: $NAMESERVER" 26 | echo "DNS host->IP file mapped: $DNSFILE" 27 | sleep 2 28 | NAMESERVER_IP=$(sudo docker logs $NAMESERVER 2>&1 | egrep '^NAMESERVER_IP=' | awk -F= '{print $2}' | tr -d -c "[:digit:] .") 29 | echo "NAMESERVER_IP: $NAMESERVER_IP" 30 | echo "address=\"/nameserver/$NAMESERVER_IP\"" > $DNSFILE 31 | } 32 | 33 | # contact nameserver container and resolve IP address (used for checking whether nameserver has registered 34 | # presence of new container). note: only returns exit code 35 | function check_hostname() { 36 | local __resultvar=$1 37 | local val_hostname=$2 38 | local val_expected_ip=$3 39 | if which dig >/dev/null; then 40 | DNSCMD="dig $val_hostname @${NAMESERVER_IP} | grep ANSWER -A1 | grep $val_expected_ip > /dev/null" 41 | else 42 | DNSCMD="nslookup $val_hostname $NAMESERVER_IP | grep Address | tail -n 1 | grep $val_expected_ip > /dev/null" 43 | fi 44 | #echo "DNSCMD: $DNSCMD" 45 | eval $DNSCMD 46 | eval $__resultvar=$? 
47 | } 48 | 49 | # contact nameserver container and resolve IP address 50 | function resolve_hostname() { 51 | local __resultvar=$1 52 | local val_hostname=$2 53 | if which dig >/dev/null; then 54 | DNSCMD="dig $val_hostname @${NAMESERVER_IP} | grep ANSWER -A1 | tail -n 1 | awk '{print \$5}'" 55 | else 56 | DNSCMD="nslookup $val_hostname $NAMESERVER_IP | grep Address | tail -n 1 | awk -F":" '{print \$2}' | awk '{print \$1}'" 57 | fi 58 | #echo "DNSCMD: $DNSCMD" 59 | tmpval=$(eval "$DNSCMD") 60 | eval $__resultvar="$tmpval" 61 | } 62 | 63 | function wait_for_nameserver { 64 | echo -n "waiting for nameserver to come up " 65 | # Note: the original scripts assumed the nameserver resolves its own 66 | # hostname to 127.0.0.1 67 | # With newer versions of Docker that is not necessarily the case anymore. 68 | # Thanks to bmustafa (24601 on GitHub) for reporting and proposing a fix! 69 | check_hostname result nameserver "$NAMESERVER_IP" 70 | until [ "$result" -eq 0 ]; do 71 | echo -n "." 72 | sleep 1 73 | check_hostname result nameserver "$NAMESERVER_IP" 74 | done 75 | echo "" 76 | } 77 | -------------------------------------------------------------------------------- /deploy/start_shell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASEDIR=$(cd $(dirname $0); pwd) 4 | source $BASEDIR/start_nameserver.sh 5 | 6 | SHELL_ID=-1 7 | SHELL_IP= 8 | NAMESERVER_IP= 9 | NAMESERVER_DIR= 10 | NAMESERVER_ID=-1 11 | 12 | image_type="?" 13 | 14 | DEBUG=1 15 | 16 | # TODO: remove redundant image list definition (source from file common to deploy.sh) 17 | spark_shell_images=( "amplab/spark-shell:0.9.0" "amplab/spark-shell:0.9.1" "amplab/spark-shell:1.0.0") 18 | shark_shell_images=( "amplab/shark-shell:0.8.0" ) 19 | 20 | # TODO: unify with deploy.sh 21 | function check_root() { 22 | if [[ "$USER" != "root" ]]; then 23 | echo "please run as: sudo $0" 24 | exit 1 25 | fi 26 | } 27 | 28 | function print_help() { 29 | echo "usage: $0 -i -n [-v ]" 30 | echo "" 31 | echo " image: spark or shark image from:" 32 | echo -n " " 33 | for i in ${spark_shell_images[@]}; do 34 | echo -n " $i" 35 | done 36 | echo "" 37 | echo -n " " 38 | for i in ${shark_shell_images[@]}; do 39 | echo -n " $i" 40 | done 41 | echo "" 42 | } 43 | 44 | function parse_options() { 45 | while getopts "i:n:v:h" opt; do 46 | case $opt in 47 | i) 48 | echo "$OPTARG" | grep "spark-shell:" > /dev/null; 49 | if [ "$?" -eq 0 ]; then 50 | image_type="spark" 51 | fi 52 | echo "$OPTARG" | grep "shark-shell:" > /dev/null; 53 | if [ "$?" -eq 0 ]; then 54 | image_type="shark" 55 | fi 56 | image_name=$(echo "$OPTARG" | awk -F ":" '{print $1}') 57 | image_version=$(echo "$OPTARG" | awk -F ":" '{print $2}') 58 | ;; 59 | h) 60 | print_help 61 | exit 0 62 | ;; 63 | v) 64 | VOLUME_MAP=$OPTARG 65 | ;; 66 | n) 67 | NAMESERVER_ID=$OPTARG 68 | ;; 69 | esac 70 | done 71 | 72 | if [ "$image_type" == "?" ]; then 73 | echo "missing or invalid option: -i " 74 | exit 1 75 | fi 76 | 77 | if [ ! "$VOLUME_MAP" == "" ]; then 78 | echo "data volume chosen: $VOLUME_MAP" 79 | VOLUME_MAP="-v $VOLUME_MAP:/data" 80 | fi 81 | } 82 | 83 | # TODO: generalize and refactor this with the code for updating 84 | # master and worker nameserver entries. 
85 | function set_nameserver_data() { 86 | IMAGENAME="$image_name:$image_version" 87 | DNSDIR=$(sudo docker inspect $NAMESERVER_ID | \ 88 | grep dnsdir | awk '{print $2}' | tr -d '":') 89 | DNSFILE="${DNSDIR}/0hosts" 90 | SHELL_IP=$(docker inspect $SHELL_ID | \ 91 | grep IPAddress | awk '{print $2}' | tr -d '":,') 92 | 93 | if [ "$DEBUG" -gt 0 ]; then 94 | echo "NAMESERVER_IP: $NAMESERVER_IP" 95 | echo "DNSFILE: $DNSFILE" 96 | echo "SHELL_IP: $SHELL_IP" 97 | echo "SHELL_HOSTNAME: $SHELL_HOSTNAME" 98 | fi 99 | 100 | echo "address=\"/$SHELL_HOSTNAME/$SHELL_IP\"" | sudo tee -a $DNSFILE > /dev/null 101 | } 102 | 103 | # starts the spark/shark shell container 104 | function start_shell() { 105 | IMAGENAME="$image_name:$image_version" 106 | NAMESERVER_IP=$(docker inspect $NAMESERVER_ID | \ 107 | grep IPAddress | awk '{print $2}' | tr -d '":,') 108 | 109 | if [ "$NAMESERVER_IP" = "" ]; then 110 | echo "error: cannot determine nameserver IP" 111 | exit 1 112 | fi 113 | 114 | #MASTER_IP=$(dig master @$NAMESERVER_IP | grep ANSWER -A1 | \ 115 | # tail -n 1 | awk '{print $5}') 116 | resolve_hostname MASTER_IP master 117 | 118 | if [ "$MASTER_IP" = "" ]; then 119 | echo "error: cannot determine master IP" 120 | exit 1 121 | fi 122 | 123 | SHELL_HOSTNAME="shell$RANDOM" 124 | echo "starting shell container" 125 | if [ "$DEBUG" -gt 0 ]; then 126 | echo sudo docker run -i -t -d --dns $NAMESERVER_IP -h $SHELL_HOSTNAME $VOLUME_MAP $IMAGENAME $MASTER_IP 127 | fi 128 | SHELL_ID=$(sudo docker run -i -t -d --dns $NAMESERVER_IP -h $SHELL_HOSTNAME $VOLUME_MAP $IMAGENAME $MASTER_IP) 129 | 130 | if [ "$SHELL_ID" = "" ]; then 131 | echo "error: could not start shell container from image $IMAGENAME" 132 | exit 1 133 | fi 134 | } 135 | 136 | check_root 137 | 138 | if [[ "$#" -eq 0 ]]; then 139 | print_help 140 | exit 1 141 | fi 142 | 143 | parse_options $@ 144 | 145 | if [ "$image_type" == "spark" ]; then 146 | SPARK_VERSION="$image_version" 147 | echo "*** Starting Spark $SPARK_VERSION Shell ***" 148 | elif [ "$image_type" == "shark" ]; then 149 | SHARK_VERSION="$image_version" 150 | # note: we currently don't have a Shark 0.9 image but it's safe Spark 151 | # to Shark's version for all but Shark 0.7.0 152 | if [ "$SHARK_VERSION" == "0.9.0" ] || [ "$SHARK_VERSION" == "0.8.0" ]; then 153 | SPARK_VERSION="$SHARK_VERSION" 154 | else 155 | SPARK_VERSION="0.7.3" 156 | fi 157 | echo "*** Starting Shark $SHARK_VERSION + Spark Shell ***" 158 | else 159 | echo "not starting anything" 160 | exit 0 161 | fi 162 | 163 | start_shell 164 | 165 | sleep 2 166 | 167 | set_nameserver_data 168 | 169 | echo -n "waiting for nameserver to find shell " 170 | SHELL_IP=$(docker inspect $SHELL_ID | \ 171 | grep IPAddress | awk '{print $2}' | tr -d '":,') 172 | 173 | check_hostname result $SHELL_HOSTNAME $SHELL_IP 174 | until [ "$result" -eq 0 ]; do 175 | echo -n "." 
176 | sleep 1 177 | check_hostname result $SHELL_HOSTNAME $SHELL_IP 178 | done 179 | 180 | echo "" 181 | echo "***************************************************************" 182 | echo "connect to shell via:" 183 | echo "sudo docker attach $SHELL_ID" 184 | 185 | -------------------------------------------------------------------------------- /deploy/start_spark_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MASTER=-1 4 | MASTER_IP= 5 | NUM_REGISTERED_WORKERS=0 6 | 7 | # starts the Spark/Shark master container 8 | function start_master() { 9 | echo "starting master container" 10 | if [ "$DEBUG" -gt 0 ]; then 11 | echo sudo docker run -d --dns $NAMESERVER_IP -h master${DOMAINNAME} $VOLUME_MAP $1:$2 12 | fi 13 | MASTER=$(sudo docker run -d --dns $NAMESERVER_IP -h master${DOMAINNAME} $VOLUME_MAP $1:$2) 14 | 15 | if [ "$MASTER" = "" ]; then 16 | echo "error: could not start master container from image $1:$2" 17 | exit 1 18 | fi 19 | 20 | echo "started master container: $MASTER" 21 | sleep 3 22 | MASTER_IP=$(sudo docker logs $MASTER 2>&1 | egrep '^MASTER_IP=' | awk -F= '{print $2}' | tr -d -c "[:digit:] .") 23 | echo "MASTER_IP: $MASTER_IP" 24 | echo "address=\"/master/$MASTER_IP\"" >> $DNSFILE 25 | } 26 | 27 | # starts a number of Spark/Shark workers 28 | function start_workers() { 29 | for i in `seq 1 $NUM_WORKERS`; do 30 | echo "starting worker container" 31 | hostname="worker${i}${DOMAINNAME}" 32 | if [ "$DEBUG" -gt 0 ]; then 33 | echo sudo docker run -d --dns $NAMESERVER_IP -h $hostname $VOLUME_MAP $1:$2 ${MASTER_IP} 34 | fi 35 | WORKER=$(sudo docker run -d --dns $NAMESERVER_IP -h $hostname $VOLUME_MAP $1:$2 ${MASTER_IP}) 36 | 37 | if [ "$WORKER" = "" ]; then 38 | echo "error: could not start worker container from image $1:$2" 39 | exit 1 40 | fi 41 | 42 | echo "started worker container: $WORKER" 43 | sleep 3 44 | WORKER_IP=$(sudo docker logs $WORKER 2>&1 | egrep '^WORKER_IP=' | awk -F= '{print $2}' | tr -d -c "[:digit:] .") 45 | echo "address=\"/$hostname/$WORKER_IP\"" >> $DNSFILE 46 | done 47 | } 48 | 49 | # prints out information on the cluster 50 | function print_cluster_info() { 51 | BASEDIR=$(cd $(dirname $0); pwd)"/.." 52 | echo "" 53 | echo "***********************************************************************" 54 | echo "start shell via: $1" 55 | echo "" 56 | echo "visit Spark WebUI at: http://$MASTER_IP:8080/" 57 | echo "visit Hadoop Namenode at: http://$MASTER_IP:50070" 58 | echo "ssh into master via: ssh -i $BASEDIR/apache-hadoop-hdfs-precise/files/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no root@${MASTER_IP}" 59 | echo "" 60 | echo "/data mapped: $VOLUME_MAP" 61 | echo "" 62 | echo "kill master via: sudo docker kill $MASTER" 63 | echo "***********************************************************************" 64 | echo "" 65 | echo "to enable cluster name resolution add the following line to _the top_ of your host's /etc/resolv.conf:" 66 | echo "nameserver $NAMESERVER_IP" 67 | } 68 | 69 | function get_num_registered_workers() { 70 | if [[ "$SPARK_VERSION" == "0.7.3" ]]; then 71 | DATA=$( curl --noproxy -s http://$MASTER_IP:8080/?format=json | tr -d '\n' | sed s/\"/\\\\\"/g) 72 | else 73 | # Docker on Mac uses tinycore Linux with busybox which has a limited version wget (?) 74 | echo $(uname -a) | grep "Linux boot2docker" > /dev/null 75 | if [[ "$?" 
== "0" ]]; then 76 | DATA=$( wget -Y off -q -O - http://$MASTER_IP:8080/json | tr -d '\n' | sed s/\"/\\\\\"/g) 77 | else 78 | DATA=$( wget --no-proxy -q -O - http://$MASTER_IP:8080/json | tr -d '\n' | sed s/\"/\\\\\"/g) 79 | fi 80 | fi 81 | NUM_REGISTERED_WORKERS=$(python -c "import json; data = \"$DATA\"; value = json.loads(data); print len(value['workers'])") 82 | } 83 | 84 | function wait_for_master { 85 | if [[ "$SPARK_VERSION" == "0.7.3" ]]; then 86 | query_string="INFO HttpServer: akka://sparkMaster/user/HttpServer started" 87 | elif [[ "$SPARK_VERSION" == "1.0.0" ]]; then 88 | query_string="MasterWebUI: Started MasterWebUI" 89 | else 90 | query_string="MasterWebUI: Started Master web UI" 91 | fi 92 | echo -n "waiting for master " 93 | sudo docker logs $MASTER | grep "$query_string" > /dev/null 94 | until [ "$?" -eq 0 ]; do 95 | echo -n "." 96 | sleep 1 97 | sudo docker logs $MASTER | grep "$query_string" > /dev/null; 98 | done 99 | echo "" 100 | echo -n "waiting for nameserver to find master " 101 | check_hostname result master "$MASTER_IP" 102 | until [ "$result" -eq 0 ]; do 103 | echo -n "." 104 | sleep 1 105 | check_hostname result master "$MASTER_IP" 106 | done 107 | echo "" 108 | sleep 3 109 | } 110 | -------------------------------------------------------------------------------- /dnsmasq-precise/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:precise 2 | 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | VOLUME [ "/etc/dnsmasq.d" ] 6 | 7 | RUN apt-get install -y dnsmasq-base 8 | 9 | RUN echo "user=root" > /etc/dnsmasq.conf 10 | RUN echo "listen-address=__LOCAL_IP__" >> /etc/dnsmasq.conf 11 | RUN echo "resolv-file=/etc/resolv.dnsmasq.conf" >> /etc/dnsmasq.conf 12 | RUN echo "conf-dir=/etc/dnsmasq.d" >> /etc/dnsmasq.conf 13 | RUN echo "domain=cluster.com" >> /etc/dnsmasq.conf 14 | 15 | RUN echo "nameserver 8.8.8.8" >> /etc/resolv.dnsmasq.conf 16 | 17 | ADD files /root/dnsmasq_files 18 | 19 | CMD ["/root/dnsmasq_files/default_cmd"] 20 | -------------------------------------------------------------------------------- /dnsmasq-precise/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}dnsmasq-precise . 
5 | -------------------------------------------------------------------------------- /dnsmasq-precise/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 4 | echo "NAMESERVER_IP=$IP" 5 | 6 | sed -i s/__LOCAL_IP__/$IP/ /etc/dnsmasq.conf 7 | 8 | dnsmasq 9 | 10 | while [ 1 ]; 11 | do 12 | sleep 3 13 | # kill and restart dnsmasq every three seconds 14 | # in case its configuration has changed 15 | pkill dnsmasq 16 | dnsmasq 17 | done 18 | -------------------------------------------------------------------------------- /mesos/NOTE.txt: -------------------------------------------------------------------------------- 1 | For build place pre-compiled mesos installation into file: 2 | mesos/mesos-base/files/mesos.tgz 3 | -------------------------------------------------------------------------------- /mesos/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mesos_dirs=$(ls -d mesos* spark-shell shark-shell) 4 | dir_list=("$mesos_dirs") 5 | 6 | # NOTE: the order matters but this is the right one 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /mesos/deploy/deploy: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # determines which Mesos image is chosen 4 | MESOS_VERSION=0.13.0 5 | 6 | # set this value to the number of workers you want 7 | NUM_WORKERS=2 8 | 9 | if [[ "$USER" != "root" ]]; then 10 | echo "please run as: sudo $0" 11 | exit 1 12 | fi 13 | 14 | source ../../dnsmasq-precise/deploy/start_nameserver.sh 15 | source start_mesos_cluster.sh 16 | 17 | echo "*** Starting Mesos $MESOS_VERSION ***" 18 | start_nameserver 19 | sleep 5 20 | start_mesos_master 21 | sleep 40 22 | start_mesos_workers 23 | sleep 3 24 | print_cluster_info 25 | 26 | -------------------------------------------------------------------------------- /mesos/deploy/start_mesos_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MASTER=-1 4 | MASTER_IP= 5 | 6 | # starts the Mesos master container 7 | function start_mesos_master() { 8 | echo "starting Mesos master container" 9 | MASTER=$(sudo docker run -i -t -d -dns $NAMESERVER_IP -h master mesos-master:$MESOS_VERSION) 10 | echo "started master container: $MASTER" 11 | sleep 3 12 | MASTER_IP=$(sudo docker logs $MASTER 2>&1 | egrep '^MASTER_IP=' | awk -F= '{print $2}' | tr -d -c "[:digit:] .") 13 | echo "MASTER_IP: $MASTER_IP" 14 | echo "address=\"/master/$MASTER_IP\"" >> $DNSFILE 15 | } 16 | 17 | # starts a number of Mesos workers 18 | function start_mesos_workers() { 19 | for i in `seq 1 $NUM_WORKERS`; do 20 | echo "starting Mesos worker container" 21 | hostname="worker${i}" 22 | WORKER=$(sudo docker run -d -dns $NAMESERVER_IP -h $hostname mesos-worker:${MESOS_VERSION} ${MASTER_IP} ${MASTER_IP}:5050) 23 | echo "started worker container: $WORKER" 24 | sleep 3 25 | WORKER_IP=$(sudo docker logs $WORKER 2>&1 | egrep '^WORKER_IP=' | awk -F= '{print $2}' | tr -d -c "[:digit:] .") 26 | echo "address=\"/$hostname/$WORKER_IP\"" >> $DNSFILE 27 | done 28 | } 29 | 30 | # prints out information on the cluster 31 | function print_cluster_info() { 32 | echo "" 33 | echo 
"***********************************************************************" 34 | echo "visit Mesos WebUI at: http://$MASTER_IP:5050/" 35 | echo "visit Hadoop Namenode at: http://$MASTER_IP:50070" 36 | echo "" 37 | echo "start Spark Shell: sudo docker run -i -t -dns $NAMESERVER_IP -h spark-client spark-shell-mesos:0.7.3 $MASTER_IP" 38 | echo "start Shark Shell: sudo docker run -i -t -dns $NAMESERVER_IP -h shark-client shark-shell-mesos:0.7.0 $MASTER_IP" 39 | echo "" 40 | echo "ssh into master via: ssh -i ../../apache-hadoop-hdfs-precise/files/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no root@${MASTER_IP}" 41 | echo "" 42 | echo "kill cluster via: docker/kill_all" 43 | echo "***********************************************************************" 44 | echo "" 45 | echo "to enable cluster name resolution add the following line to _the top_ of your host's /etc/resolv.conf:" 46 | echo "nameserver $NAMESERVER_IP" 47 | } 48 | 49 | -------------------------------------------------------------------------------- /mesos/mesos-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base Ubuntu Precise 12.04 LTS image 2 | # 3 | FROM amplab/shark-base:0.7.0 4 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 5 | 6 | #RUN apt-get install -y libcurl4-openssl-dev 7 | RUN apt-get install -y libcurl3 8 | 9 | # add Hadoop config file templates 10 | # NOTE: we rather do this as a single ADD statement 11 | # since we are running into 12 | # Error build: Unable to mount using aufs 13 | # Unable to mount using aufs 14 | # issue. For more information see 15 | # https://github.com/dotcloud/docker/issues/1171 16 | ADD files /root/mesos_files 17 | 18 | RUN (mv /root/mesos_files/mesos.tgz / && cd / && gunzip < mesos.tgz)|(cd /opt && tar -xvf -) && (rm /mesos.tgz && ln -s /opt/mesos /tmp/mesos) 19 | 20 | -------------------------------------------------------------------------------- /mesos/mesos-base/build: -------------------------------------------------------------------------------- 1 | sudo docker build -t amplab/mesos-base:0.13.0 . 
2 | -------------------------------------------------------------------------------- /mesos/mesos-base/files/configure_mesos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/shark_files/configure_shark.sh 4 | 5 | function create_mesos_directories() { 6 | create_shark_directories 7 | mkdir /tmp/mesos 8 | chown hdfs.hdfs /tmp/mesos 9 | } 10 | 11 | function deploy_mesos_files() { 12 | deploy_shark_files 13 | } 14 | 15 | function configure_mesos() { 16 | configure_shark $1 17 | sed -i s/"^export MASTER="/"#export MASTER="/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 18 | echo "export MASTER=mesos://$1:5050" >> /opt/spark-$SPARK_VERSION/conf/spark-env.sh 19 | echo "export MESOS_NATIVE_LIBRARY=/opt/mesos/lib/libmesos-0.13.0.so" >> /opt/spark-$SPARK_VERSION/conf/spark-env.sh 20 | echo "export JAVA_LIBRARY_PATH=/opt/mesos/lib/libmesos-0.13.0.so" >> /opt/spark-$SPARK_VERSION/conf/spark-env.sh 21 | } 22 | 23 | function prepare_mesos() { 24 | create_mesos_directories 25 | deploy_mesos_files 26 | configure_mesos $1 27 | } 28 | -------------------------------------------------------------------------------- /mesos/mesos-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # Mesos 2 | FROM amplab/mesos-base:0.13.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Setup a volume for data 6 | #VOLUME ["/data"] 7 | 8 | ADD files /root/mesos_master_files 9 | 10 | CMD ["/root/mesos_master_files/default_cmd"] 11 | -------------------------------------------------------------------------------- /mesos/mesos-master/build: -------------------------------------------------------------------------------- 1 | sudo docker build -t amplab/mesos-master:0.13.0 . 
2 | -------------------------------------------------------------------------------- /mesos/mesos-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env 4 | 5 | source /root/mesos_files/configure_mesos.sh 6 | 7 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 8 | echo "MASTER_IP=$IP" 9 | 10 | echo "preparing Mesos" 11 | prepare_mesos $IP 12 | 13 | echo "starting Hadoop Namenode" 14 | sudo -u hdfs hadoop namenode -format 15 | service hadoop-namenode start 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | sleep 5 21 | 22 | echo "starting Mesos Master" 23 | cp /root/mesos_master_files/run_mesos_master.sh / 24 | chmod a+rx /run_mesos_master.sh 25 | sudo -u hdfs LD_LIBRARY_PATH=$LD_LIBRARY_PATH /run_mesos_master.sh $IP 26 | -------------------------------------------------------------------------------- /mesos/mesos-master/files/run_mesos_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export LD_LIBRARY_PATH=/usr/lib/jvm/java-7-openjdk-amd64/jre/lib/amd64/server 3 | cd /opt/mesos/sbin && ./mesos-master --ip=$1 4 | -------------------------------------------------------------------------------- /mesos/mesos-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Mesos 2 | FROM amplab/mesos-base:0.13.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Setup a volume for data 6 | #VOLUME ["/data"] 7 | 8 | ADD files /root/mesos_worker_files 9 | 10 | # Add the entrypoint script for the master 11 | CMD ["-h"] 12 | ENTRYPOINT ["/root/mesos_worker_files/default_cmd"] 13 | -------------------------------------------------------------------------------- /mesos/mesos-worker/build: -------------------------------------------------------------------------------- 1 | sudo docker build -t amplab/mesos-worker:0.13.0 . 
2 | -------------------------------------------------------------------------------- /mesos/mesos-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/mesos_files/configure_mesos.sh 4 | 5 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 6 | echo "WORKER_IP=$IP" 7 | 8 | echo "preparing Mesos" 9 | prepare_mesos $1 10 | 11 | echo "starting Hadoop Datanode" 12 | service hadoop-datanode start 13 | 14 | echo "starting sshd" 15 | /usr/sbin/sshd 16 | 17 | sleep 5 18 | 19 | echo "starting Mesos Worker" 20 | cp /root/mesos_worker_files/run_mesos_worker.sh / 21 | chmod a+rx /run_mesos_worker.sh 22 | sudo -u hdfs HADOOP_HOME=$HADOOP_HOME LD_LIBRARY_PATH=$LD_LIBRARY_PATH /run_mesos_worker.sh $2 $IP 23 | -------------------------------------------------------------------------------- /mesos/mesos-worker/files/run_mesos_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export LD_LIBRARY_PATH=/usr/lib/jvm/java-7-openjdk-amd64/jre/lib/amd64/server 3 | cd /opt/mesos/sbin && ./mesos-slave --master=$1 --ip=$2 --hadoop_home=$HADOOP_HOME 4 | -------------------------------------------------------------------------------- /mesos/shark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM amplab/mesos-base:0.13.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | ADD files /root/shark_shell_files 6 | 7 | # Add the entrypoint script for the master 8 | ENTRYPOINT ["/root/shark_shell_files/default_cmd"] 9 | -------------------------------------------------------------------------------- /mesos/shark-shell/build: -------------------------------------------------------------------------------- 1 | sudo docker build -t amplab/shark-shell-mesos:0.7.0 . 2 | -------------------------------------------------------------------------------- /mesos/shark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/mesos_files/configure_mesos.sh 4 | 5 | env 6 | 7 | echo "preparing Mesos" 8 | prepare_mesos $1 9 | 10 | echo "starting Shark Shell" 11 | cd /opt/metastore && sudo -u hdfs /opt/shark-$SHARK_VERSION/bin/shark 12 | -------------------------------------------------------------------------------- /mesos/spark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM amplab/mesos-base:0.13.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | ADD files /root/spark_shell_files 6 | 7 | # Add the entrypoint script for the master 8 | ENTRYPOINT ["/root/spark_shell_files/default_cmd"] 9 | -------------------------------------------------------------------------------- /mesos/spark-shell/build: -------------------------------------------------------------------------------- 1 | sudo docker build -t amplab/spark-shell-mesos:0.7.3 . 
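Because the mesos-worker image pairs an ENTRYPOINT with a default CMD of ["-h"], any arguments given to docker run replace the -h and are forwarded to its default_cmd: $1 is the master's IP (consumed by prepare_mesos) and $2 is the value handed to mesos-slave as --master. A hedged manual invocation follows; the IP is a placeholder, the host:port form for --master is an assumption, and mesos/deploy/start_mesos_cluster.sh remains the supported way to bring a cluster up:

    # illustrative only; real clusters are started via the mesos deploy scripts
    MASTER_IP=10.0.0.2   # hypothetical address printed by the master container as MASTER_IP=...
    sudo docker run -d amplab/mesos-worker:0.13.0 ${MASTER_IP} ${MASTER_IP}:5050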
2 | -------------------------------------------------------------------------------- /mesos/spark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/mesos_files/configure_mesos.sh 4 | 5 | env 6 | 7 | echo "preparing Mesos" 8 | prepare_mesos $1 9 | 10 | echo "adding test data to HDFS" 11 | cp /root/spark_shell_files/test.txt /tmp 12 | sudo -u hdfs hadoop fs -put /tmp/test.txt hdfs://$1:9000/user/hdfs/test.txt 13 | 14 | echo "starting Spark Shell" 15 | cd $SPARK_HOME 16 | echo SPARK_HOME: `pwd` 17 | echo SHARK_VERSION: $SHARK_VERSION 18 | if [ "$SPARK_VERSION" == "0.8.0" ] || [ "$SPARK_VERSION" == "0.7.3" ]; then 19 | sudo -u hdfs HDFS_PREFIX=hdfs://${1}:9000 ./spark-shell 20 | else 21 | sudo -u hdfs HDFS_PREFIX=hdfs://${1}:9000 ./bin/spark-shell 22 | fi 23 | -------------------------------------------------------------------------------- /mesos/spark-shell/files/test.txt: -------------------------------------------------------------------------------- 1 | this is a test 2 | more test 3 | one more line 4 | -------------------------------------------------------------------------------- /shark-0.7.0/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | shark_dirs=$(ls -d shark*) 4 | dir_list=("$shark_dirs") 5 | 6 | # NOTE: the order matters but this is the right one 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 0.7.3, Shark 0.7.0 2 | # Version 0.7.0 3 | # 4 | # Use spark-base as base 5 | FROM spark-base:0.7.3 6 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 7 | 8 | # note: SPARK_VERSION should be inherited from spark-base 9 | # but for some reason isn't (?) 10 | ENV SPARK_VERSION 0.7.3 11 | ENV SHARK_VERSION 0.7.0 12 | ENV HIVE_VERSION 0.9.0 13 | 14 | # Install Shark 15 | ADD http://spark-project.org/download/shark-${SHARK_VERSION}-hadoop1-bin.tgz / 16 | RUN (cd / && gunzip < shark-${SHARK_VERSION}-hadoop1-bin.tgz)|(cd /opt && tar -xvf -) 17 | RUN rm /shark-${SHARK_VERSION}-hadoop1-bin.tgz 18 | 19 | # Add Shark config files and configure script 20 | ADD files /root/shark_files 21 | 22 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-base/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-base:0.7.0 . 
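Note that shark-base is built FROM spark-base:0.7.3, so the Spark 0.7.3 images (and the apache-hadoop-hdfs-precise base beneath them) must already exist locally. A hedged sketch of building in dependency order by hand, assuming each directory's build script is sourced from inside that directory, as the version-level build scripts above do; the repository's top-level build helpers automate this:

    # illustrative dependency order when building manually
    (cd apache-hadoop-hdfs-precise && . build)
    (cd spark-0.7.3 && . build)
    (cd shark-0.7.0 && . build)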
5 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-base/files/configure_shark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | shark_files=( "/root/shark_files/shark-env.sh" ) 6 | hive_files=( "/root/shark_files/hive-site.xml" "/etc/hadoop/core-site.xml" ) 7 | 8 | function create_shark_directories() { 9 | create_spark_directories 10 | rm -rf /opt/metastore 11 | mkdir /opt/metastore 12 | chown hdfs.hdfs /opt/metastore 13 | } 14 | 15 | function deploy_shark_files() { 16 | deploy_spark_files 17 | for i in "${hive_files[@]}"; 18 | do 19 | filename=$(basename $i); 20 | cp $i /opt/hive-${HIVE_VERSION}-bin/conf/$filename; 21 | done 22 | for i in "${shark_files[@]}"; 23 | do 24 | filename=$(basename $i); 25 | cp $i /opt/shark-${SHARK_VERSION}/conf/$filename; 26 | done 27 | } 28 | 29 | function configure_shark() { 30 | configure_spark $1 31 | # Shark 32 | sed -i s/__SPARK_HOME__/"\/opt\/spark-${SPARK_VERSION}"/ /opt/shark-$SHARK_VERSION/conf/shark-env.sh 33 | sed -i s/__HIVE_HOME__/"\/opt\/hive-${HIVE_VERSION}-bin"/ /opt/shark-$SHARK_VERSION/conf/shark-env.sh 34 | # Hive 35 | sed -i s/__MASTER__/$1/ /opt/hive-0.9.0-bin/conf/hive-site.xml 36 | #sed -i s/__MASTER__/master/ /opt/hive-0.9.0-bin/conf/hive-site.xml 37 | } 38 | 39 | function prepare_shark() { 40 | create_shark_directories 41 | deploy_shark_files 42 | configure_shark $1 43 | } 44 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-base/files/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fs.default.name 4 | hdfs://__MASTER__:9000/ 5 | 6 | 7 | fs.defaultFS 8 | hdfs://__MASTER__:9000/ 9 | 10 | 11 | mapred.job.tracker 12 | NONE 13 | 14 | 15 | hive.exec.scratchdir 16 | /tmp/hive-scratch 17 | Scratch space for Hive jobs 18 | 19 | 20 | hive.metastore.local 21 | true 22 | 23 | 24 | javax.jdo.option.ConnectionURL 25 | jdbc:derby:;databaseName=metastore_db;create=true 26 | 27 | 28 | javax.jdo.option.ConnectionDriverName 29 | org.apache.derby.jdbc.EmbeddedDriver 30 | 31 | 32 | hive.metastore.metadb.dir 33 | file:///opt/metastore/metadb/ 34 | 35 | 36 | hive.metastore.uris 37 | file:///opt/metastore/metadb/ 38 | 39 | 40 | hive.metastore.warehouse.dir 41 | hdfs://__MASTER__:9000/user/hdfs/warehouse 42 | 43 | 44 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-base/files/shark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | . __SPARK_HOME__/conf/spark-env.sh 3 | export SHARK_MASTER_MEM=700m 4 | export HIVE_HOME=__HIVE_HOME__ 5 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # Shark master 2 | FROM shark-base:0.7.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Add run script 6 | ADD files /root/shark_master_files 7 | 8 | # Add default command for master 9 | CMD ["/root/shark_master_files/default_cmd"] 10 | 11 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-master/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . 
-type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-master:0.7.0 . 5 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # run this as: 4 | # sudo docker run -i -t -d shark-master:$SHARK_VERSION 5 | 6 | source /root/shark_files/configure_shark.sh 7 | 8 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 9 | echo "MASTER_IP=$IP" 10 | 11 | echo "preparing Shark" 12 | prepare_shark $IP 13 | 14 | echo "starting Hadoop namenode" 15 | sudo -u hdfs hadoop namenode -format > /dev/null 2>&1 16 | service hadoop-namenode start > /dev/null 2>&1 17 | 18 | echo "starting sshd" 19 | /usr/sbin/sshd 20 | 21 | sleep 5 22 | 23 | echo "starting Shark master" 24 | cp /root/shark_master_files/run_shark_master.sh / 25 | chmod a+rx /run_shark_master.sh 26 | sudo -u hdfs /run_shark_master.sh 27 | #$IP 28 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-master/files/run_shark_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/shark-0.7.0/conf/shark-env.sh 3 | export PATH=$PATH:$SCALA_HOME/bin 4 | export CLASSPATH=$CLASSPATH:$SCALA_HOME/lib/scala-library.jar 5 | #/opt/spark-0.7.3/run spark.deploy.master.Master -i $1 6 | /opt/spark-0.7.3/run spark.deploy.master.Master -i master 7 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Shark master 2 | FROM shark-base:0.7.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Add run script 6 | ADD files /root/shark_shell_files 7 | 8 | # Add default command for master 9 | ENTRYPOINT ["/root/shark_shell_files/default_cmd"] 10 | 11 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-shell/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-shell:0.7.0 . 5 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/shark_files/configure_shark.sh 4 | prepare_shark $1 5 | env 6 | sudo -u hdfs hadoop dfsadmin -safemode wait 7 | 8 | # Note: there are issues if the nameserver did not have time to 9 | # refresh its cache with this shell's hostname so give him time 10 | # to do so. 
11 | sleep 3 12 | 13 | cd /opt/metastore && sudo -u hdfs /opt/shark-$SHARK_VERSION/bin/shark 14 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-shell/files/test.shark: -------------------------------------------------------------------------------- 1 | CREATE TABLE src(key INT, value STRING); 2 | LOAD DATA LOCAL INPATH '${env:HIVE_HOME}/examples/files/kv1.txt' INTO TABLE src; 3 | SELECT COUNT(1) FROM src; 4 | exit; 5 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Shark worker 2 | FROM shark-base:0.7.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Add run script 6 | ADD files /root/shark_worker_files 7 | 8 | # Add the entrypoint script for the worker 9 | ENTRYPOINT ["/root/shark_worker_files/default_cmd"] 10 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-worker/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-worker:0.7.0 . 5 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # run this as: 4 | # sudo docker run -d shark-worker:${SHARK_VERSION} ${MASTER_IP} spark://${MASTER_IP}:7077 5 | 6 | source /root/shark_files/configure_shark.sh 7 | 8 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 9 | echo "WORKER_IP=$IP" 10 | 11 | echo "preparing Shark" 12 | prepare_shark $1 13 | 14 | echo "starting Hadoop datanode" 15 | service hadoop-datanode start 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | sleep 5 21 | 22 | echo "starting Shark worker node" 23 | cp /root/shark_worker_files/run_shark_worker.sh / 24 | chmod a+rx /run_shark_worker.sh 25 | sudo -u hdfs /run_shark_worker.sh 26 | #$2 27 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-worker/files/run_shark_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/shark-0.7.0/conf/shark-env.sh 3 | export PATH=$PATH:$SCALA_HOME/bin 4 | export CLASSPATH=$CLASSPATH:$SCALA_HOME/lib/scala-library.jar 5 | #/opt/spark-0.7.3/run spark.deploy.worker.Worker $1 6 | #/opt/spark-0.7.3/run spark.deploy.worker.Worker -i $(hostname) spark://master:7077 7 | ${SPARK_HOME}/run spark.deploy.worker.Worker spark://master:7077 8 | -------------------------------------------------------------------------------- /shark-0.8.0/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | shark_dirs=$(ls -d shark*) 4 | dir_list=("$shark_dirs") 5 | 6 | # NOTE: the order matters but this is the right one 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . 
build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 0.8.0, Shark 0.8.0 2 | # 3 | # Use spark-base as base 4 | FROM spark-base:0.8.0 5 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 6 | 7 | # note: SPARK_VERSION should be inherited from spark-base 8 | # but for some reason isn't (?) 9 | ENV SPARK_VERSION 0.8.0 10 | ENV SHARK_VERSION 0.8.0 11 | ENV HIVE_VERSION 0.9.0 12 | 13 | # Install Shark 14 | ADD https://github.com/amplab/shark/releases/download/v${SHARK_VERSION}/shark-${SHARK_VERSION}-bin-hadoop1.tgz / 15 | RUN (cd / && gunzip < shark-${SHARK_VERSION}-bin-hadoop1.tgz)|(cd /opt && tar -xvf -) 16 | RUN (ln -s /opt/shark-${SHARK_VERSION}-bin-hadoop1/shark-${SHARK_VERSION} /opt/shark-${SHARK_VERSION} && ln -s /opt/shark-${SHARK_VERSION}-bin-hadoop1/hive-${HIVE_VERSION}-shark-${SHARK_VERSION}-bin /opt/hive-${HIVE_VERSION}-bin && rm /shark-${SHARK_VERSION}-bin-hadoop1.tgz) 17 | 18 | # Add Shark config files and configure script 19 | ADD files /root/shark_files 20 | 21 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-base/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-base:0.8.0 . 5 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-base/files/configure_shark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | shark_files=( "/root/shark_files/shark-env.sh" ) 6 | hive_files=( "/root/shark_files/hive-site.xml" "/etc/hadoop/core-site.xml" ) 7 | 8 | function create_shark_directories() { 9 | create_spark_directories 10 | rm -rf /opt/metastore 11 | mkdir /opt/metastore 12 | chown hdfs.hdfs /opt/metastore 13 | } 14 | 15 | function deploy_shark_files() { 16 | deploy_spark_files 17 | for i in "${hive_files[@]}"; 18 | do 19 | filename=$(basename $i); 20 | cp $i /opt/hive-${HIVE_VERSION}-bin/conf/$filename; 21 | done 22 | for i in "${shark_files[@]}"; 23 | do 24 | filename=$(basename $i); 25 | cp $i /opt/shark-${SHARK_VERSION}/conf/$filename; 26 | done 27 | } 28 | 29 | function configure_shark() { 30 | configure_spark $1 31 | # Shark 32 | sed -i s/__SPARK_HOME__/"\/opt\/spark-${SPARK_VERSION}"/ /opt/shark-$SHARK_VERSION/conf/shark-env.sh 33 | sed -i s/__HIVE_HOME__/"\/opt\/hive-${HIVE_VERSION}-bin"/ /opt/shark-$SHARK_VERSION/conf/shark-env.sh 34 | # Hive 35 | sed -i s/__MASTER__/$1/ /opt/hive-0.9.0-bin/conf/hive-site.xml 36 | } 37 | 38 | function prepare_shark() { 39 | create_shark_directories 40 | deploy_shark_files 41 | configure_shark $1 42 | } 43 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-base/files/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fs.default.name 4 | hdfs://__MASTER__:9000/ 5 | 6 | 7 | fs.defaultFS 8 | hdfs://__MASTER__:9000/ 9 | 10 | 11 | mapred.job.tracker 12 | NONE 13 | 14 | 15 | hive.exec.scratchdir 16 | /tmp/hive-scratch 17 | Scratch space for Hive jobs 18 | 19 | 20 | hive.metastore.local 21 
| true 22 | 23 | 24 | javax.jdo.option.ConnectionURL 25 | jdbc:derby:;databaseName=metastore_db;create=true 26 | 27 | 28 | javax.jdo.option.ConnectionDriverName 29 | org.apache.derby.jdbc.EmbeddedDriver 30 | 31 | 32 | hive.metastore.metadb.dir 33 | file:///opt/metastore/metadb/ 34 | 35 | 36 | hive.metastore.uris 37 | file:///opt/metastore/metadb/ 38 | 39 | 40 | hive.metastore.warehouse.dir 41 | hdfs://__MASTER__:9000/user/hdfs/warehouse 42 | 43 | 44 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-base/files/shark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | . __SPARK_HOME__/conf/spark-env.sh 3 | export SHARK_MASTER_MEM=700m 4 | export HIVE_HOME=__HIVE_HOME__ 5 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # Shark master 2 | FROM shark-base:0.8.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Add run script 6 | ADD files /root/shark_master_files 7 | 8 | # Add default command for master 9 | CMD ["/root/shark_master_files/default_cmd"] 10 | 11 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-master/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-master:0.8.0 . 5 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # run this as: 4 | # sudo docker run -i -t -d shark-master:$SHARK_VERSION 5 | 6 | source /root/shark_files/configure_shark.sh 7 | 8 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 9 | echo "MASTER_IP=$IP" 10 | 11 | echo "preparing Shark" 12 | prepare_shark $IP 13 | 14 | echo "starting Hadoop namenode" 15 | sudo -u hdfs hadoop namenode -format > /dev/null 2>&1 16 | service hadoop-namenode start > /dev/null 2>&1 17 | 18 | echo "starting sshd" 19 | /usr/sbin/sshd 20 | 21 | sleep 5 22 | 23 | echo "starting Shark master" 24 | cp /root/shark_master_files/run_shark_master.sh / 25 | chmod a+rx /run_shark_master.sh 26 | sudo -u hdfs SPARK_VERSION=$SPARK_VERSION SHARK_VERSION=$SHARK_VERSION /run_shark_master.sh $IP 27 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-master/files/run_shark_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . 
/opt/shark-0.8.0/conf/shark-env.sh 3 | export PATH=$PATH:$SCALA_HOME/bin 4 | export CLASSPATH=$CLASSPATH:$SCALA_HOME/lib/scala-library.jar 5 | 6 | /opt/spark-0.8.0/bin/start-master.sh 7 | 8 | while [ 1 ]; 9 | do 10 | tail -f /opt/spark-${SPARK_VERSION}/logs/*.out 11 | sleep 1 12 | done 13 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Shark master 2 | FROM shark-base:0.8.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Add run script 6 | ADD files /root/shark_shell_files 7 | 8 | # Add default command for master 9 | ENTRYPOINT ["/root/shark_shell_files/default_cmd"] 10 | 11 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-shell/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-shell:0.8.0 . 5 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/shark_files/configure_shark.sh 4 | prepare_shark $1 5 | env 6 | 7 | # Note: there are issues if the nameserver did not have time to 8 | # refresh its cache with this shell's hostname so give him time 9 | # to do so. 10 | sleep 3 11 | 12 | cd /opt/metastore && sudo -u hdfs /opt/shark-$SHARK_VERSION/bin/shark 13 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-shell/files/test.shark: -------------------------------------------------------------------------------- 1 | CREATE TABLE src(key INT, value STRING); 2 | LOAD DATA LOCAL INPATH '${env:HIVE_HOME}/examples/files/kv1.txt' INTO TABLE src; 3 | SELECT COUNT(1) FROM src; 4 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Shark worker 2 | FROM shark-base:0.8.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Add run script 6 | ADD files /root/shark_worker_files 7 | 8 | # Add the entrypoint script for the worker 9 | ENTRYPOINT ["/root/shark_worker_files/default_cmd"] 10 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-worker/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-worker:0.8.0 . 
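Every image directory's build script starts with the same preamble: it regenerates files/files.hash, a manifest of git blob hashes for every file in the build context, presumably so that the ADD files layer (and every layer after it) is rebuilt whenever any packaged file changes. A behaviour-equivalent variant that tolerates whitespace in filenames, offered only as a sketch rather than a drop-in replacement:

    # sketch of the same manifest step, null-delimited to survive odd filenames
    rm -f files/files.hash
    find . -type f -print0 | while IFS= read -r -d '' f; do
      printf '%s\t%s\n' "$(git hash-object "$f")" "${f#./}"
    done > /tmp/files.hash
    mv /tmp/files.hash files/files.hash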
5 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # run this as: 4 | # sudo docker run -d shark-worker:${SHARK_VERSION} ${MASTER_IP} spark://${MASTER_IP}:7077 5 | 6 | source /root/shark_files/configure_shark.sh 7 | 8 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 9 | echo "WORKER_IP=$IP" 10 | 11 | echo "preparing Shark" 12 | prepare_shark $1 13 | 14 | echo "starting Hadoop datanode" 15 | service hadoop-datanode start 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | sleep 5 21 | 22 | echo "starting Shark worker node" 23 | cp /root/shark_worker_files/run_shark_worker.sh / 24 | chmod a+rx /run_shark_worker.sh 25 | sudo -u hdfs SPARK_VERSION=$SPARK_VERSION SHARK_VERSION=$SHARK_VERSION /run_shark_worker.sh 26 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-worker/files/run_shark_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/shark-0.8.0/conf/shark-env.sh 3 | export PATH=$PATH:$SCALA_HOME/bin 4 | export CLASSPATH=$CLASSPATH:$SCALA_HOME/lib/scala-library.jar 5 | /opt/spark-0.8.0/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077 6 | -------------------------------------------------------------------------------- /spark-0.7.3/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | spark_dirs=$(ls -d spark*) 4 | dir_list=("$spark_dirs") 5 | 6 | # NOTE: the order matters but this is the right one 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Hadoop 1.2.1 2 | # Version 1.2.1 3 | # 4 | FROM apache-hadoop-hdfs-precise:1.2.1 5 | 6 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 7 | 8 | ENV SCALA_VERSION 2.9.3 9 | ENV SPARK_VERSION 0.7.3 10 | ENV SCALA_HOME /opt/scala-$SCALA_VERSION 11 | ENV SPARK_HOME /opt/spark-$SPARK_VERSION 12 | ENV PATH $SPARK_HOME:$SCALA_HOME/bin:$PATH 13 | 14 | # Install Scala 15 | ADD http://www.scala-lang.org/files/archive/scala-$SCALA_VERSION.tgz / 16 | RUN (cd / && gunzip < scala-$SCALA_VERSION.tgz)|(cd /opt && tar -xvf -) 17 | RUN rm /scala-$SCALA_VERSION.tgz && chown -R hdfs.hdfs /opt/scala-$SCALA_VERSION 18 | 19 | # Install Spark 20 | ADD http://spark-project.org/download/spark-$SPARK_VERSION-prebuilt-hadoop1.tgz / 21 | RUN (cd / && gunzip < spark-$SPARK_VERSION-prebuilt-hadoop1.tgz)|(cd /opt && tar -xvf -) 22 | RUN rm /spark-$SPARK_VERSION-prebuilt-hadoop1.tgz 23 | 24 | # Add Spark config files and configure script 25 | ADD files /root/spark_files 26 | 27 | #RUN cp /root/spark_files/spark-0.7.3_precomp_hadoop1.tar.gz / 28 | #RUN (cd / && gunzip < spark-0.7.3_precomp_hadoop1.tar.gz)|(cd /opt && tar -xvf -) 29 | #RUN rm /spark-0.7.3_precomp_hadoop1.tar.gz 30 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-base/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . 
-type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-base:0.7.3 . 5 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-base/files/configure_spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/hadoop_files/configure_hadoop.sh 4 | 5 | function create_spark_directories() { 6 | create_hadoop_directories 7 | rm -rf /opt/spark-$SPARK_VERSION/work 8 | mkdir -p /opt/spark-$SPARK_VERSION/work 9 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/work 10 | mkdir /tmp/spark 11 | chown hdfs.hdfs /tmp/spark 12 | # this one is for Spark shell logging 13 | rm -rf /var/lib/hadoop/hdfs 14 | mkdir -p /var/lib/hadoop/hdfs 15 | chown hdfs.hdfs /var/lib/hadoop/hdfs 16 | } 17 | 18 | function deploy_spark_files() { 19 | deploy_hadoop_files 20 | cp /root/spark_files/spark-env.sh /opt/spark-$SPARK_VERSION/conf/ 21 | cp /root/spark_files/log4j.properties /opt/spark-$SPARK_VERSION/conf/ 22 | } 23 | 24 | function configure_spark() { 25 | configure_hadoop $1 26 | #sed -i s/__MASTER__/$1/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 27 | #sed -i s/__MASTER__/master/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 28 | sed -i s/__SPARK_HOME__/"\/opt\/spark-${SPARK_VERSION}"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 29 | sed -i s/__JAVA_HOME__/"\/usr\/lib\/jvm\/java-7-openjdk-amd64"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 30 | } 31 | 32 | function prepare_spark() { 33 | create_spark_directories 34 | deploy_spark_files 35 | configure_spark $1 36 | } 37 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-base/files/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 6 | 7 | # Ignore messages below warning level from Jetty, because it's a bit verbose 8 | log4j.logger.org.eclipse.jetty=WARN 9 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-base/files/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export SCALA_HOME=/opt/scala-2.9.3 3 | export SPARK_HOME=__SPARK_HOME__ 4 | export SPARK_WORKER_CORES=1 5 | export SPARK_MEM=800m 6 | export SPARK_WORKER_MEMORY=1500m 7 | export SPARK_MASTER_MEM=1500m 8 | export SPARK_WORKER_CORES=1 9 | export HADOOP_HOME="/etc/hadoop" 10 | export MASTER="spark://master:7077" 11 | export SPARK_LOCAL_DIR=/tmp/spark 12 | #SPARK_JAVA_OPTS="-Dspark.local.dir=/tmp/spark " 13 | #SPARK_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 14 | #SPARK_JAVA_OPTS+="-Dspark.kryoserializer.buffer.mb=10 " 15 | #SPARK_JAVA_OPTS+="-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " 16 | #export SPARK_JAVA_OPTS 17 | #SPARK_DAEMON_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 18 | #export SPARK_DAEMON_JAVA_OPTS 19 | export JAVA_HOME=__JAVA_HOME__ 20 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-master/Dockerfile: 
-------------------------------------------------------------------------------- 1 | # Spark 0.7.3 2 | # Version 0.7.3 3 | FROM spark-base:0.7.3 4 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 5 | 6 | ADD files /root/spark_master_files 7 | 8 | CMD ["/root/spark_master_files/default_cmd"] 9 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-master/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-master:0.7.3 . 5 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env 4 | 5 | source /root/spark_files/configure_spark.sh 6 | 7 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 8 | echo "MASTER_IP=$IP" 9 | 10 | echo "preparing Spark" 11 | prepare_spark $IP 12 | 13 | echo "starting Hadoop Namenode" 14 | sudo -u hdfs hadoop namenode -format > /dev/null 2>&1 15 | service hadoop-namenode start > /dev/null 2>&1 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | # note: it seems important to sleep here 21 | sleep 5 22 | 23 | echo "starting Spark Master" 24 | cp /root/spark_master_files/run_spark_master.sh / 25 | chmod a+rx /run_spark_master.sh 26 | sudo -u hdfs /run_spark_master.sh 27 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-master/files/run_spark_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/spark-0.7.3/conf/spark-env.sh 3 | /opt/spark-0.7.3/run spark.deploy.master.Master -i master 4 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 0.7.3 2 | # Version 0.7.3 3 | FROM spark-base:0.7.3 4 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 5 | 6 | VOLUME [ "/etc/dnsmasq.d" ] 7 | 8 | ADD files /root/spark_shell_files 9 | 10 | # Add the entrypoint script for the master 11 | ENTRYPOINT ["/root/spark_shell_files/default_cmd"] 12 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-shell/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-shell:0.7.3 . 
5 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | env 6 | 7 | echo "preparing Spark" 8 | prepare_spark "master" 9 | 10 | echo "adding test data to HDFS" 11 | cp /root/spark_shell_files/test.txt /tmp 12 | sudo -u hdfs hadoop dfsadmin -safemode wait 13 | sudo -u hdfs hadoop fs -put /tmp/test.txt hdfs://master:9000/user/hdfs/test.txt 14 | 15 | # Note: there are issues if the nameserver did not have time to 16 | # refresh its cache with this shell's hostname so give him time 17 | # to do so. 18 | sleep 3 19 | 20 | echo "starting Spark Shell" 21 | cd $SPARK_HOME 22 | sudo -u hdfs HDFS_PREFIX=hdfs://master:9000 ./spark-shell 23 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-shell/files/test.spark: -------------------------------------------------------------------------------- 1 | val hdfs_prefix = System.getenv("HDFS_PREFIX") 2 | val textFile = sc.textFile(hdfs_prefix+"/user/hdfs/test.txt") 3 | textFile.count() 4 | textFile.map({line => line}).collect() 5 | exit 6 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-shell/files/test.txt: -------------------------------------------------------------------------------- 1 | this is a test 2 | more test 3 | one more line 4 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 0.7.3 2 | # Version 0.7.3 3 | FROM spark-base:0.7.3 4 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 5 | 6 | ADD files /root/spark_worker_files 7 | 8 | # Add the entrypoint script for the master 9 | CMD ["-h"] 10 | ENTRYPOINT ["/root/spark_worker_files/default_cmd"] 11 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-worker/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-worker:0.7.3 . 5 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 6 | echo "WORKER_IP=$IP" 7 | 8 | echo "preparing Spark" 9 | prepare_spark $1 10 | 11 | echo "starting Hadoop Datanode" 12 | service hadoop-datanode start 13 | 14 | echo "starting sshd" 15 | /usr/sbin/sshd 16 | 17 | sleep 5 18 | 19 | echo "starting Spark Worker" 20 | cp /root/spark_worker_files/run_spark_worker.sh / 21 | chmod a+rx /run_spark_worker.sh 22 | sudo -u hdfs /run_spark_worker.sh 23 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-worker/files/run_spark_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . 
/opt/spark-0.7.3/conf/spark-env.sh 3 | /opt/spark-0.7.3/run spark.deploy.worker.Worker spark://master:7077 4 | -------------------------------------------------------------------------------- /spark-0.8.0/NOTE.txt: -------------------------------------------------------------------------------- 1 | Many of the files here are in fact identical to the ones in the 2 | Spark 0.7.3 directory. However, since Docker does not follow 3 | symbolic links when it builds images we need the duplication. 4 | -------------------------------------------------------------------------------- /spark-0.8.0/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | spark_dirs=$(ls -d spark*) 4 | dir_list=("$spark_dirs") 5 | 6 | # NOTE: the order matters but this is the right one 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 0.8.0 2 | # Version 0.8.0 3 | # 4 | FROM apache-hadoop-hdfs-precise:1.2.1 5 | 6 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 7 | 8 | ENV SCALA_VERSION 2.9.3 9 | ENV SPARK_VERSION 0.8.0 10 | ENV SCALA_HOME /opt/scala-$SCALA_VERSION 11 | ENV SPARK_HOME /opt/spark-$SPARK_VERSION 12 | ENV PATH $SPARK_HOME:$SCALA_HOME/bin:$PATH 13 | 14 | # Install Scala 15 | ADD http://www.scala-lang.org/files/archive/scala-$SCALA_VERSION.tgz / 16 | RUN (cd / && gunzip < scala-$SCALA_VERSION.tgz)|(cd /opt && tar -xvf -) 17 | RUN rm /scala-$SCALA_VERSION.tgz 18 | 19 | # Install Spark 20 | ADD http://d3kbcqa49mib13.cloudfront.net/spark-$SPARK_VERSION-incubating-bin-hadoop1.tgz / 21 | RUN (cd / && gunzip < spark-$SPARK_VERSION-incubating-bin-hadoop1.tgz)|(cd /opt && tar -xvf -) 22 | RUN (ln -s /opt/spark-$SPARK_VERSION-incubating-bin-hadoop1 /opt/spark-$SPARK_VERSION && rm /spark-$SPARK_VERSION-incubating-bin-hadoop1.tgz) 23 | 24 | # Add Shark config files and configure script 25 | ADD files /root/spark_files 26 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-base/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-base:0.8.0 . 
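As the NOTE above explains, Docker does not follow symbolic links in the build context, so each Spark/Shark version directory carries its own copies of the shared scripts and config templates. A hypothetical way to spot when those copies drift apart (not part of this repo); at the time of writing the 0.8.0 and 0.9.0 spark-base files should differ only in the Scala version inside spark-env.sh:

    # hypothetical drift check between duplicated spark-base build contexts
    diff -ru spark-0.8.0/spark-base/files spark-0.9.0/spark-base/files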
5 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-base/files/configure_spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/hadoop_files/configure_hadoop.sh 4 | 5 | function create_spark_directories() { 6 | create_hadoop_directories 7 | rm -rf /opt/spark-$SPARK_VERSION/work 8 | mkdir -p /opt/spark-$SPARK_VERSION/work 9 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/work 10 | mkdir /tmp/spark 11 | chown hdfs.hdfs /tmp/spark 12 | # this one is for Spark shell logging 13 | rm -rf /var/lib/hadoop/hdfs 14 | mkdir -p /var/lib/hadoop/hdfs 15 | chown hdfs.hdfs /var/lib/hadoop/hdfs 16 | rm -rf /opt/spark-$SPARK_VERSION/logs 17 | mkdir -p /opt/spark-$SPARK_VERSION/logs 18 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/logs 19 | } 20 | 21 | function deploy_spark_files() { 22 | deploy_hadoop_files 23 | cp /root/spark_files/spark-env.sh /opt/spark-$SPARK_VERSION/conf/ 24 | cp /root/spark_files/log4j.properties /opt/spark-$SPARK_VERSION/conf/ 25 | } 26 | 27 | function configure_spark() { 28 | configure_hadoop $1 29 | #sed -i s/__MASTER__/$1/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 30 | sed -i s/__MASTER__/master/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 31 | sed -i s/__SPARK_HOME__/"\/opt\/spark-${SPARK_VERSION}"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 32 | sed -i s/__JAVA_HOME__/"\/usr\/lib\/jvm\/java-7-openjdk-amd64"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 33 | } 34 | 35 | function prepare_spark() { 36 | create_spark_directories 37 | deploy_spark_files 38 | configure_spark $1 39 | } 40 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-base/files/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 6 | 7 | # Ignore messages below warning level from Jetty, because it's a bit verbose 8 | log4j.logger.org.eclipse.jetty=WARN 9 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-base/files/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export SCALA_HOME=/opt/scala-2.9.3 3 | export SPARK_HOME=__SPARK_HOME__ 4 | export SPARK_WORKER_CORES=1 5 | export SPARK_MEM=800m 6 | export SPARK_WORKER_MEMORY=1500m 7 | export SPARK_MASTER_MEM=1500m 8 | export SPARK_MASTER_IP=__MASTER__ 9 | export HADOOP_HOME="/etc/hadoop" 10 | export MASTER="spark://__MASTER__:7077" 11 | export SPARK_LOCAL_DIR=/tmp/spark 12 | #SPARK_JAVA_OPTS="-Dspark.local.dir=/tmp/spark " 13 | #SPARK_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 14 | #SPARK_JAVA_OPTS+="-Dspark.kryoserializer.buffer.mb=10 " 15 | #SPARK_JAVA_OPTS+="-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " 16 | #export SPARK_JAVA_OPTS 17 | #SPARK_DAEMON_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 18 | #export SPARK_DAEMON_JAVA_OPTS 19 | export JAVA_HOME=__JAVA_HOME__ 20 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.8.0 3 | 
MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Expose TCP ports 7077 8080 6 | EXPOSE 7077 8080 7 | 8 | ADD files /root/spark_master_files 9 | 10 | CMD ["/root/spark_master_files/default_cmd"] 11 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-master/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-master:0.8.0 . 5 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env 4 | 5 | source /root/spark_files/configure_spark.sh 6 | 7 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 8 | echo "MASTER_IP=$IP" 9 | 10 | echo "preparing Spark" 11 | prepare_spark $IP 12 | 13 | echo "starting Hadoop Namenode" 14 | sudo -u hdfs hadoop namenode -format > /dev/null 2>&1 15 | service hadoop-namenode start > /dev/null 2>&1 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | sleep 5 21 | 22 | echo "starting Spark Master" 23 | cp /root/spark_master_files/run_spark_master.sh / 24 | chmod a+rx /run_spark_master.sh 25 | sudo -u hdfs SPARK_VERSION=$SPARK_VERSION /run_spark_master.sh 26 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-master/files/run_spark_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | /opt/spark-0.8.0/bin/start-master.sh 3 | 4 | while [ 1 ]; 5 | do 6 | tail -f /opt/spark-${SPARK_VERSION}/logs/*.out 7 | sleep 1 8 | done 9 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.8.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_shell_files 10 | 11 | # Add the entrypoint script for the master 12 | ENTRYPOINT ["/root/spark_shell_files/default_cmd"] 13 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-shell/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-shell:0.8.0 . 
5 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | env 6 | 7 | echo "preparing Spark" 8 | prepare_spark "master" 9 | 10 | echo "adding test data to HDFS" 11 | cp /root/spark_shell_files/test.txt /tmp 12 | sudo -u hdfs hadoop dfsadmin -safemode wait 13 | sudo -u hdfs hadoop fs -put /tmp/test.txt hdfs://master:9000/user/hdfs/test.txt 14 | 15 | cp /root/spark_shell_files/test.spark / 16 | 17 | # Note: there are issues if the nameserver did not have time to 18 | # refresh its cache with this shell's hostname so give him time 19 | # to do so. 20 | sleep 3 21 | 22 | echo "starting Spark Shell" 23 | 24 | cd $SPARK_HOME 25 | sudo -u hdfs MASTER=spark://master:7077 HDFS_PREFIX=hdfs://master:9000 ./spark-shell 26 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-shell/files/test.spark: -------------------------------------------------------------------------------- 1 | val hdfs_prefix = System.getenv("HDFS_PREFIX") 2 | val textFile = sc.textFile(hdfs_prefix+"/user/hdfs/test.txt") 3 | textFile.count() 4 | textFile.map({line => line}).collect() 5 | exit 6 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-shell/files/test.txt: -------------------------------------------------------------------------------- 1 | this is a test 2 | more test 3 | one more line 4 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.8.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_worker_files 10 | 11 | # Add the entrypoint script for the master 12 | CMD ["-h"] 13 | ENTRYPOINT ["/root/spark_worker_files/default_cmd"] 14 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-worker/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-worker:0.8.0 . 
5 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 6 | echo "WORKER_IP=$IP" 7 | 8 | echo "preparing Spark" 9 | prepare_spark $1 10 | 11 | echo "starting Hadoop Datanode" 12 | service hadoop-datanode start 13 | 14 | echo "starting sshd" 15 | /usr/sbin/sshd 16 | 17 | sleep 5 18 | 19 | echo "starting Spark Worker" 20 | cp /root/spark_worker_files/run_spark_worker.sh / 21 | chmod a+rx /run_spark_worker.sh 22 | sudo -u hdfs /run_spark_worker.sh 23 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-worker/files/run_spark_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/spark-0.8.0/conf/spark-env.sh 3 | ${SPARK_HOME}/spark-class org.apache.spark.deploy.worker.Worker $MASTER 4 | -------------------------------------------------------------------------------- /spark-0.9.0/NOTE.txt: -------------------------------------------------------------------------------- 1 | Many of the files here are in fact identical to the ones in the 2 | Spark 0.8.0 directory. However, since Docker does not follow 3 | symbolic links when it builds images we need the duplication. 4 | -------------------------------------------------------------------------------- /spark-0.9.0/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | spark_dirs=$(ls -d spark*) 4 | dir_list=("$spark_dirs") 5 | 6 | # NOTE: the order matters but this is the right one 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 0.9.0 2 | # Version 0.9.0 3 | # 4 | FROM apache-hadoop-hdfs-precise:1.2.1 5 | 6 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 7 | 8 | ENV SCALA_VERSION 2.10.3 9 | ENV SPARK_VERSION 0.9.0 10 | ENV SCALA_HOME /opt/scala-$SCALA_VERSION 11 | ENV SPARK_HOME /opt/spark-$SPARK_VERSION 12 | ENV PATH $SPARK_HOME:$SCALA_HOME/bin:$PATH 13 | 14 | # Install Scala 15 | ADD http://www.scala-lang.org/files/archive/scala-$SCALA_VERSION.tgz / 16 | RUN (cd / && gunzip < scala-$SCALA_VERSION.tgz)|(cd /opt && tar -xvf -) 17 | RUN rm /scala-$SCALA_VERSION.tgz 18 | 19 | # Install Spark 20 | ADD http://d3kbcqa49mib13.cloudfront.net/spark-$SPARK_VERSION-incubating-bin-hadoop1.tgz / 21 | RUN (cd / && gunzip < spark-$SPARK_VERSION-incubating-bin-hadoop1.tgz)|(cd /opt && tar -xvf -) 22 | RUN (ln -s /opt/spark-$SPARK_VERSION-incubating-bin-hadoop1 /opt/spark-$SPARK_VERSION && rm /spark-$SPARK_VERSION-incubating-bin-hadoop1.tgz) 23 | 24 | # Add Spark config files and configure script 25 | ADD files /root/spark_files 26 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-base/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find .
-type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-base:0.9.0 . 5 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-base/files/configure_spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/hadoop_files/configure_hadoop.sh 4 | 5 | function create_spark_directories() { 6 | create_hadoop_directories 7 | rm -rf /opt/spark-$SPARK_VERSION/work 8 | mkdir -p /opt/spark-$SPARK_VERSION/work 9 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/work 10 | mkdir /tmp/spark 11 | chown hdfs.hdfs /tmp/spark 12 | # this one is for Spark shell logging 13 | rm -rf /var/lib/hadoop/hdfs 14 | mkdir -p /var/lib/hadoop/hdfs 15 | chown hdfs.hdfs /var/lib/hadoop/hdfs 16 | rm -rf /opt/spark-$SPARK_VERSION/logs 17 | mkdir -p /opt/spark-$SPARK_VERSION/logs 18 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/logs 19 | } 20 | 21 | function deploy_spark_files() { 22 | deploy_hadoop_files 23 | cp /root/spark_files/spark-env.sh /opt/spark-$SPARK_VERSION/conf/ 24 | cp /root/spark_files/log4j.properties /opt/spark-$SPARK_VERSION/conf/ 25 | } 26 | 27 | function configure_spark() { 28 | configure_hadoop $1 29 | #sed -i s/__MASTER__/$1/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 30 | sed -i s/__MASTER__/master/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 31 | sed -i s/__SPARK_HOME__/"\/opt\/spark-${SPARK_VERSION}"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 32 | sed -i s/__JAVA_HOME__/"\/usr\/lib\/jvm\/java-7-openjdk-amd64"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 33 | } 34 | 35 | function prepare_spark() { 36 | create_spark_directories 37 | deploy_spark_files 38 | configure_spark $1 39 | } 40 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-base/files/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 6 | 7 | # Ignore messages below warning level from Jetty, because it's a bit verbose 8 | log4j.logger.org.eclipse.jetty=WARN 9 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-base/files/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export SCALA_HOME=/opt/scala-2.10.3 3 | export SPARK_HOME=__SPARK_HOME__ 4 | export SPARK_WORKER_CORES=1 5 | export SPARK_MEM=800m 6 | export SPARK_WORKER_MEMORY=1500m 7 | export SPARK_MASTER_MEM=1500m 8 | export SPARK_MASTER_IP=__MASTER__ 9 | export HADOOP_HOME="/etc/hadoop" 10 | export MASTER="spark://__MASTER__:7077" 11 | export SPARK_LOCAL_DIR=/tmp/spark 12 | #SPARK_JAVA_OPTS="-Dspark.local.dir=/tmp/spark " 13 | #SPARK_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 14 | #SPARK_JAVA_OPTS+="-Dspark.kryoserializer.buffer.mb=10 " 15 | #SPARK_JAVA_OPTS+="-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " 16 | #export SPARK_JAVA_OPTS 17 | #SPARK_DAEMON_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 18 | #export SPARK_DAEMON_JAVA_OPTS 19 | export JAVA_HOME=__JAVA_HOME__ 20 | 
-------------------------------------------------------------------------------- /spark-0.9.0/spark-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.9.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Expose TCP ports 7077 8080 6 | EXPOSE 7077 8080 7 | 8 | ADD files /root/spark_master_files 9 | 10 | CMD ["/root/spark_master_files/default_cmd"] 11 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-master/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-master:0.9.0 . 5 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env 4 | 5 | source /root/spark_files/configure_spark.sh 6 | 7 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 8 | echo "MASTER_IP=$IP" 9 | 10 | echo "preparing Spark" 11 | prepare_spark $IP 12 | 13 | echo "starting Hadoop Namenode" 14 | sudo -u hdfs hadoop namenode -format > /dev/null 2>&1 15 | service hadoop-namenode start > /dev/null 2>&1 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | sleep 5 21 | 22 | echo "starting Spark Master" 23 | cp /root/spark_master_files/run_spark_master.sh / 24 | chmod a+rx /run_spark_master.sh 25 | sudo -u hdfs SPARK_VERSION=$SPARK_VERSION /run_spark_master.sh 26 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-master/files/run_spark_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | /opt/spark-0.9.0/sbin/start-master.sh 3 | 4 | while [ 1 ]; 5 | do 6 | tail -f /opt/spark-${SPARK_VERSION}/logs/*.out 7 | sleep 1 8 | done 9 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.9.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_shell_files 10 | 11 | # Add the entrypoint script for the master 12 | ENTRYPOINT ["/root/spark_shell_files/default_cmd"] 13 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-shell/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-shell:0.9.0 . 
5 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | env 6 | 7 | echo "preparing Spark" 8 | prepare_spark "master" 9 | 10 | echo "adding test data to HDFS" 11 | cp /root/spark_shell_files/test.txt /tmp 12 | sudo -u hdfs hadoop dfsadmin -safemode wait 13 | sudo -u hdfs hadoop fs -put /tmp/test.txt hdfs://master:9000/user/hdfs/test.txt 14 | 15 | cp /root/spark_shell_files/test.spark / 16 | 17 | # Note: there are issues if the nameserver did not have time to 18 | # refresh its cache with this shell's hostname so give him time 19 | # to do so. 20 | sleep 3 21 | 22 | echo "starting Spark Shell" 23 | 24 | cd $SPARK_HOME 25 | sudo -u hdfs MASTER=spark://master:7077 HDFS_PREFIX=hdfs://master:9000 ./bin/spark-shell 26 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-shell/files/test.spark: -------------------------------------------------------------------------------- 1 | val hdfs_prefix = System.getenv("HDFS_PREFIX") 2 | val textFile = sc.textFile(hdfs_prefix+"/user/hdfs/test.txt") 3 | textFile.count() 4 | textFile.map({line => line}).collect() 5 | exit 6 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-shell/files/test.txt: -------------------------------------------------------------------------------- 1 | this is a test 2 | more test 3 | one more line 4 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.9.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_worker_files 10 | 11 | # Add the entrypoint script for the master 12 | CMD ["-h"] 13 | ENTRYPOINT ["/root/spark_worker_files/default_cmd"] 14 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-worker/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-worker:0.9.0 . 
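
NOTE: each of the build scripts above snapshots its inputs into files/files.hash (one git blob hash per file) before running docker build. A small sketch, assuming it is run from inside one of the image directories (for example spark-0.9.0/spark-worker), to recompute that manifest and see whether the image inputs have drifted since the last build:

    #!/bin/bash
    # Recompute the per-file git blob hashes and diff them against files/files.hash.
    # files/files.hash itself is skipped because the build scripts delete it before hashing.
    for i in $(find . -type f | sed 's/\.\///' | grep -v 'files/files.hash'); do
      printf '%s\t%s\n' "$(git hash-object $i)" "$i"
    done > /tmp/files.hash.new
    diff /tmp/files.hash.new files/files.hash \
      && echo "image inputs unchanged" \
      || echo "image inputs differ from the recorded files.hash"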
5 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 6 | echo "WORKER_IP=$IP" 7 | 8 | echo "preparing Spark" 9 | prepare_spark $1 10 | 11 | echo "starting Hadoop Datanode" 12 | service hadoop-datanode start 13 | 14 | echo "starting sshd" 15 | /usr/sbin/sshd 16 | 17 | sleep 5 18 | 19 | echo "starting Spark Worker" 20 | cp /root/spark_worker_files/run_spark_worker.sh / 21 | chmod a+rx /run_spark_worker.sh 22 | sudo -u hdfs /run_spark_worker.sh 23 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-worker/files/run_spark_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/spark-0.9.0/conf/spark-env.sh 3 | ${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.worker.Worker $MASTER 4 | -------------------------------------------------------------------------------- /spark-0.9.1/NOTE.txt: -------------------------------------------------------------------------------- 1 | Many of the files here are in fact identical to the ones in the 2 | other Spark version directories. However, since Docker does not follow 3 | symbolic links when it builds images, the duplication is necessary. 4 | -------------------------------------------------------------------------------- /spark-0.9.1/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | spark_dirs=$(ls -d spark*) 4 | dir_list=("$spark_dirs") 5 | 6 | # NOTE: the build order matters; spark-base must be built before the other images, and this listing yields that order 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 0.9.1 2 | # Version 0.9.1 3 | # 4 | FROM apache-hadoop-hdfs-precise:1.2.1 5 | 6 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 7 | 8 | ENV SCALA_VERSION 2.10.3 9 | ENV SPARK_VERSION 0.9.1 10 | ENV SCALA_HOME /opt/scala-$SCALA_VERSION 11 | ENV SPARK_HOME /opt/spark-$SPARK_VERSION 12 | ENV PATH $SPARK_HOME:$SCALA_HOME/bin:$PATH 13 | 14 | # Install Scala 15 | ADD http://www.scala-lang.org/files/archive/scala-$SCALA_VERSION.tgz / 16 | RUN (cd / && gunzip < scala-$SCALA_VERSION.tgz)|(cd /opt && tar -xvf -) 17 | RUN rm /scala-$SCALA_VERSION.tgz 18 | 19 | # Install Spark 20 | ADD http://d3kbcqa49mib13.cloudfront.net/spark-$SPARK_VERSION-bin-hadoop1.tgz / 21 | RUN (cd / && gunzip < spark-$SPARK_VERSION-bin-hadoop1.tgz)|(cd /opt && tar -xvf -) 22 | RUN (ln -s /opt/spark-$SPARK_VERSION-bin-hadoop1 /opt/spark-$SPARK_VERSION && rm /spark-$SPARK_VERSION-bin-hadoop1.tgz) 23 | 24 | # Add Spark config files and configure script 25 | ADD files /root/spark_files 26 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-base/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . 
-type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-base:0.9.1 . 5 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-base/files/configure_spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/hadoop_files/configure_hadoop.sh 4 | 5 | function create_spark_directories() { 6 | create_hadoop_directories 7 | rm -rf /opt/spark-$SPARK_VERSION/work 8 | mkdir -p /opt/spark-$SPARK_VERSION/work 9 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/work 10 | mkdir /tmp/spark 11 | chown hdfs.hdfs /tmp/spark 12 | # this one is for Spark shell logging 13 | rm -rf /var/lib/hadoop/hdfs 14 | mkdir -p /var/lib/hadoop/hdfs 15 | chown hdfs.hdfs /var/lib/hadoop/hdfs 16 | rm -rf /opt/spark-$SPARK_VERSION/logs 17 | mkdir -p /opt/spark-$SPARK_VERSION/logs 18 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/logs 19 | } 20 | 21 | function deploy_spark_files() { 22 | deploy_hadoop_files 23 | cp /root/spark_files/spark-env.sh /opt/spark-$SPARK_VERSION/conf/ 24 | cp /root/spark_files/log4j.properties /opt/spark-$SPARK_VERSION/conf/ 25 | } 26 | 27 | function configure_spark() { 28 | configure_hadoop $1 29 | #sed -i s/__MASTER__/$1/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 30 | sed -i s/__MASTER__/master/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 31 | sed -i s/__SPARK_HOME__/"\/opt\/spark-${SPARK_VERSION}"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 32 | sed -i s/__JAVA_HOME__/"\/usr\/lib\/jvm\/java-7-openjdk-amd64"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 33 | } 34 | 35 | function prepare_spark() { 36 | create_spark_directories 37 | deploy_spark_files 38 | configure_spark $1 39 | } 40 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-base/files/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 6 | 7 | # Ignore messages below warning level from Jetty, because it's a bit verbose 8 | log4j.logger.org.eclipse.jetty=WARN 9 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-base/files/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export SCALA_HOME=/opt/scala-2.10.3 3 | export SPARK_HOME=__SPARK_HOME__ 4 | export SPARK_WORKER_CORES=1 5 | export SPARK_MEM=800m 6 | export SPARK_WORKER_MEMORY=1500m 7 | export SPARK_MASTER_MEM=1500m 8 | export SPARK_MASTER_IP=__MASTER__ 9 | export HADOOP_HOME="/etc/hadoop" 10 | export MASTER="spark://__MASTER__:7077" 11 | export SPARK_LOCAL_DIR=/tmp/spark 12 | #SPARK_JAVA_OPTS="-Dspark.local.dir=/tmp/spark " 13 | #SPARK_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 14 | #SPARK_JAVA_OPTS+="-Dspark.kryoserializer.buffer.mb=10 " 15 | #SPARK_JAVA_OPTS+="-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " 16 | #export SPARK_JAVA_OPTS 17 | #SPARK_DAEMON_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 18 | #export SPARK_DAEMON_JAVA_OPTS 19 | export JAVA_HOME=__JAVA_HOME__ 20 | 
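
NOTE: the substituted spark-env.sh above is what both the worker entrypoint and run_spark_worker.sh source, so the worker ends up connecting to spark://master:7077. A quick reachability sketch, meant to be run from inside an already-running worker or shell container, assuming the dnsmasq nameserver started by the deploy scripts resolves the hostname "master":

    #!/bin/bash
    # Confirm the standalone master port is reachable before the worker registers.
    . /opt/spark-0.9.1/conf/spark-env.sh
    echo "configured master URL: $MASTER"
    if (exec 3<>"/dev/tcp/${SPARK_MASTER_IP}/7077") 2>/dev/null; then
      echo "${SPARK_MASTER_IP}:7077 is reachable"
    else
      echo "cannot reach ${SPARK_MASTER_IP}:7077 -- is the spark-master container up?"
    fi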
-------------------------------------------------------------------------------- /spark-0.9.1/spark-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.9.1 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Expose TCP ports 7077 8080 6 | EXPOSE 7077 8080 7 | 8 | ADD files /root/spark_master_files 9 | 10 | CMD ["/root/spark_master_files/default_cmd"] 11 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-master/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-master:0.9.1 . 5 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env 4 | 5 | source /root/spark_files/configure_spark.sh 6 | 7 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 8 | echo "MASTER_IP=$IP" 9 | 10 | echo "preparing Spark" 11 | prepare_spark $IP 12 | 13 | echo "starting Hadoop Namenode" 14 | sudo -u hdfs hadoop namenode -format > /dev/null 2>&1 15 | service hadoop-namenode start > /dev/null 2>&1 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | sleep 5 21 | 22 | echo "starting Spark Master" 23 | cp /root/spark_master_files/run_spark_master.sh / 24 | chmod a+rx /run_spark_master.sh 25 | sudo -u hdfs SPARK_VERSION=$SPARK_VERSION /run_spark_master.sh 26 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-master/files/run_spark_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | /opt/spark-0.9.1/sbin/start-master.sh 3 | 4 | while [ 1 ]; 5 | do 6 | tail -f /opt/spark-${SPARK_VERSION}/logs/*.out 7 | sleep 1 8 | done 9 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.9.1 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_shell_files 10 | 11 | # Add the entrypoint script for the master 12 | ENTRYPOINT ["/root/spark_shell_files/default_cmd"] 13 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-shell/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-shell:0.9.1 . 
5 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | env 6 | 7 | echo "preparing Spark" 8 | prepare_spark "master" 9 | 10 | echo "adding test data to HDFS" 11 | cp /root/spark_shell_files/test.txt /tmp 12 | sudo -u hdfs hadoop dfsadmin -safemode wait 13 | sudo -u hdfs hadoop fs -put /tmp/test.txt hdfs://master:9000/user/hdfs/test.txt 14 | 15 | cp /root/spark_shell_files/test.spark / 16 | 17 | # Note: there are issues if the nameserver did not have time to 18 | # refresh its cache with this shell's hostname so give him time 19 | # to do so. 20 | sleep 3 21 | 22 | echo "starting Spark Shell" 23 | 24 | cd $SPARK_HOME 25 | sudo -u hdfs MASTER=spark://master:7077 HDFS_PREFIX=hdfs://master:9000 ./bin/spark-shell 26 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-shell/files/test.spark: -------------------------------------------------------------------------------- 1 | val hdfs_prefix = System.getenv("HDFS_PREFIX") 2 | val textFile = sc.textFile(hdfs_prefix+"/user/hdfs/test.txt") 3 | textFile.count() 4 | textFile.map({line => line}).collect() 5 | exit 6 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-shell/files/test.txt: -------------------------------------------------------------------------------- 1 | this is a test 2 | more test 3 | one more line 4 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.9.1 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_worker_files 10 | 11 | # Add the entrypoint script for the master 12 | CMD ["-h"] 13 | ENTRYPOINT ["/root/spark_worker_files/default_cmd"] 14 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-worker/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-worker:0.9.1 . 
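
NOTE: once run_spark_worker.sh has started the Worker, the master's web UI (port 8080, exposed by the spark-master image) lists registered workers with IDs of the form worker-<timestamp>-<host>-<port>. A rough check scripted against that page, assuming curl is available wherever the check runs; check_workers.sh and the default address are hypothetical, pass the address printed by the master container instead:

    #!/bin/bash
    # Usage: ./check_workers.sh [master address]
    MASTER_IP=${1:-master}
    if curl -s "http://${MASTER_IP}:8080" | grep -q 'worker-'; then
      echo "at least one worker is registered with the master"
    else
      echo "no workers registered yet"
    fi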
5 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 6 | echo "WORKER_IP=$IP" 7 | 8 | echo "preparing Spark" 9 | prepare_spark $1 10 | 11 | echo "starting Hadoop Datanode" 12 | service hadoop-datanode start 13 | 14 | echo "starting sshd" 15 | /usr/sbin/sshd 16 | 17 | sleep 5 18 | 19 | echo "starting Spark Worker" 20 | cp /root/spark_worker_files/run_spark_worker.sh / 21 | chmod a+rx /run_spark_worker.sh 22 | sudo -u hdfs /run_spark_worker.sh 23 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-worker/files/run_spark_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/spark-0.9.1/conf/spark-env.sh 3 | ${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.worker.Worker $MASTER 4 | -------------------------------------------------------------------------------- /spark-1.0.0/NOTE.txt: -------------------------------------------------------------------------------- 1 | Many of the files here are in fact identical to the ones in the 2 | other Spark version directories. However, since Docker does not follow 3 | symbolic links when it builds images, the duplication is necessary. 4 | -------------------------------------------------------------------------------- /spark-1.0.0/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | spark_dirs=$(ls -d spark*) 4 | dir_list=("$spark_dirs") 5 | 6 | # NOTE: the build order matters; spark-base must be built before the other images, and this listing yields that order 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 1.0.0 2 | # Version 1.0.0 3 | # 4 | FROM apache-hadoop-hdfs-precise:1.2.1 5 | 6 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 7 | 8 | ENV SCALA_VERSION 2.10.3 9 | ENV SPARK_VERSION 1.0.0 10 | ENV SCALA_HOME /opt/scala-$SCALA_VERSION 11 | ENV SPARK_HOME /opt/spark-$SPARK_VERSION 12 | ENV PATH $SPARK_HOME:$SCALA_HOME/bin:$PATH 13 | 14 | # Install Scala 15 | ADD http://www.scala-lang.org/files/archive/scala-$SCALA_VERSION.tgz / 16 | RUN (cd / && gunzip < scala-$SCALA_VERSION.tgz)|(cd /opt && tar -xvf -) 17 | RUN rm /scala-$SCALA_VERSION.tgz 18 | 19 | # Install Spark 20 | ADD http://d3kbcqa49mib13.cloudfront.net/spark-$SPARK_VERSION-bin-hadoop1.tgz / 21 | RUN (cd / && gunzip < spark-$SPARK_VERSION-bin-hadoop1.tgz)|(cd /opt && tar -xvf -) 22 | RUN (ln -s /opt/spark-$SPARK_VERSION-bin-hadoop1 /opt/spark-$SPARK_VERSION && rm /spark-$SPARK_VERSION-bin-hadoop1.tgz) 23 | 24 | # Add Spark config files and configure script 25 | ADD files /root/spark_files 26 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-base/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . 
-type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-base:1.0.0 . 5 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-base/files/configure_spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/hadoop_files/configure_hadoop.sh 4 | 5 | function create_spark_directories() { 6 | create_hadoop_directories 7 | rm -rf /opt/spark-$SPARK_VERSION/work 8 | mkdir -p /opt/spark-$SPARK_VERSION/work 9 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/work 10 | mkdir /tmp/spark 11 | chown hdfs.hdfs /tmp/spark 12 | # this one is for Spark shell logging 13 | rm -rf /var/lib/hadoop/hdfs 14 | mkdir -p /var/lib/hadoop/hdfs 15 | chown hdfs.hdfs /var/lib/hadoop/hdfs 16 | rm -rf /opt/spark-$SPARK_VERSION/logs 17 | mkdir -p /opt/spark-$SPARK_VERSION/logs 18 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/logs 19 | } 20 | 21 | function deploy_spark_files() { 22 | deploy_hadoop_files 23 | cp /root/spark_files/spark-env.sh /opt/spark-$SPARK_VERSION/conf/ 24 | cp /root/spark_files/log4j.properties /opt/spark-$SPARK_VERSION/conf/ 25 | } 26 | 27 | function configure_spark() { 28 | configure_hadoop $1 29 | #sed -i s/__MASTER__/$1/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 30 | sed -i s/__MASTER__/master/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 31 | sed -i s/__SPARK_HOME__/"\/opt\/spark-${SPARK_VERSION}"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 32 | sed -i s/__JAVA_HOME__/"\/usr\/lib\/jvm\/java-7-openjdk-amd64"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 33 | } 34 | 35 | function prepare_spark() { 36 | create_spark_directories 37 | deploy_spark_files 38 | configure_spark $1 39 | } 40 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-base/files/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 6 | 7 | # Ignore messages below warning level from Jetty, because it's a bit verbose 8 | log4j.logger.org.eclipse.jetty=WARN 9 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-base/files/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export SCALA_HOME=/opt/scala-2.10.3 3 | export SPARK_HOME=__SPARK_HOME__ 4 | export SPARK_WORKER_CORES=1 5 | export SPARK_MEM=800m 6 | export SPARK_WORKER_MEMORY=1500m 7 | export SPARK_MASTER_MEM=1500m 8 | export SPARK_MASTER_IP=__MASTER__ 9 | export HADOOP_HOME="/etc/hadoop" 10 | export MASTER="spark://__MASTER__:7077" 11 | export SPARK_LOCAL_DIR=/tmp/spark 12 | #SPARK_JAVA_OPTS="-Dspark.local.dir=/tmp/spark " 13 | #SPARK_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 14 | #SPARK_JAVA_OPTS+="-Dspark.kryoserializer.buffer.mb=10 " 15 | #SPARK_JAVA_OPTS+="-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " 16 | #export SPARK_JAVA_OPTS 17 | #SPARK_DAEMON_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 18 | #export SPARK_DAEMON_JAVA_OPTS 19 | export JAVA_HOME=__JAVA_HOME__ 20 | 
-------------------------------------------------------------------------------- /spark-1.0.0/spark-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:1.0.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Expose TCP ports 7077 8080 6 | EXPOSE 7077 8080 7 | 8 | ADD files /root/spark_master_files 9 | 10 | CMD ["/root/spark_master_files/default_cmd"] 11 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-master/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-master:1.0.0 . 5 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env 4 | 5 | source /root/spark_files/configure_spark.sh 6 | 7 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 8 | echo "MASTER_IP=$IP" 9 | 10 | echo "preparing Spark" 11 | prepare_spark $IP 12 | 13 | echo "starting Hadoop Namenode" 14 | sudo -u hdfs hadoop namenode -format > /dev/null 2>&1 15 | service hadoop-namenode start > /dev/null 2>&1 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | sleep 5 21 | 22 | echo "starting Spark Master" 23 | cp /root/spark_master_files/run_spark_master.sh / 24 | chmod a+rx /run_spark_master.sh 25 | sudo -u hdfs SPARK_VERSION=$SPARK_VERSION /run_spark_master.sh 26 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-master/files/run_spark_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | /opt/spark-1.0.0/sbin/start-master.sh 3 | 4 | while [ 1 ]; 5 | do 6 | tail -f /opt/spark-${SPARK_VERSION}/logs/*.out 7 | sleep 1 8 | done 9 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:1.0.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_shell_files 10 | 11 | # Add the entrypoint script for the master 12 | ENTRYPOINT ["/root/spark_shell_files/default_cmd"] 13 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-shell/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-shell:1.0.0 . 
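
NOTE: the spark-master entrypoint above formats HDFS, starts the namenode and sshd, and then hands off to run_spark_master.sh, which tails the master's *.out logs to stdout forever. The container log is therefore the quickest place to confirm the master came up. A sketch, assuming docker is driven from the host and the container id is the one reported by the deploy scripts; check_master.sh is a hypothetical helper, not part of this repository:

    #!/bin/bash
    # Usage: ./check_master.sh <spark-master container id>
    CONTAINER="$1"
    # default_cmd echoes MASTER_IP=..., and the tailed master log should contain a
    # "Starting Spark master at spark://..." style line once the master is up.
    sudo docker logs "$CONTAINER" 2>&1 | grep -E 'MASTER_IP=|Starting Spark master'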
5 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | env 6 | 7 | echo "preparing Spark" 8 | prepare_spark "master" 9 | 10 | echo "adding test data to HDFS" 11 | cp /root/spark_shell_files/test.txt /tmp 12 | sudo -u hdfs hadoop dfsadmin -safemode wait 13 | sudo -u hdfs hadoop fs -put /tmp/test.txt hdfs://master:9000/user/hdfs/test.txt 14 | 15 | cp /root/spark_shell_files/test.spark / 16 | 17 | # Note: there are issues if the nameserver did not have time to 18 | # refresh its cache with this shell's hostname so give him time 19 | # to do so. 20 | sleep 3 21 | 22 | echo "starting Spark Shell" 23 | 24 | cd $SPARK_HOME 25 | sudo -u hdfs MASTER=spark://master:7077 HDFS_PREFIX=hdfs://master:9000 ./bin/spark-shell 26 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-shell/files/test.spark: -------------------------------------------------------------------------------- 1 | val hdfs_prefix = System.getenv("HDFS_PREFIX") 2 | val textFile = sc.textFile(hdfs_prefix+"/user/hdfs/test.txt") 3 | textFile.count() 4 | textFile.map({line => line}).collect() 5 | exit 6 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-shell/files/test.txt: -------------------------------------------------------------------------------- 1 | this is a test 2 | more test 3 | one more line 4 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:1.0.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_worker_files 10 | 11 | # Add the entrypoint script for the master 12 | CMD ["-h"] 13 | ENTRYPOINT ["/root/spark_worker_files/default_cmd"] 14 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-worker/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-worker:1.0.0 . 
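
NOTE: the spark-shell entrypoint above seeds HDFS with test.txt and then runs test.spark against it. A sketch to verify the seed data by hand, assuming it is run from a container whose Hadoop client is already configured (for example the shell container itself):

    #!/bin/bash
    # Inspect the seeded test data; test.spark should then count 3 lines and collect
    #   Array(this is a test, more test, one more line)
    # which is exactly the string test/test_all.sh greps for.
    sudo -u hdfs hadoop fs -ls hdfs://master:9000/user/hdfs/test.txt
    sudo -u hdfs hadoop fs -cat hdfs://master:9000/user/hdfs/test.txt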
5 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 6 | echo "WORKER_IP=$IP" 7 | 8 | echo "preparing Spark" 9 | prepare_spark $1 10 | 11 | echo "starting Hadoop Datanode" 12 | service hadoop-datanode start 13 | 14 | echo "starting sshd" 15 | /usr/sbin/sshd 16 | 17 | sleep 5 18 | 19 | echo "starting Spark Worker" 20 | cp /root/spark_worker_files/run_spark_worker.sh / 21 | chmod a+rx /run_spark_worker.sh 22 | sudo -u hdfs /run_spark_worker.sh 23 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-worker/files/run_spark_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/spark-1.0.0/conf/spark-env.sh 3 | ${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.worker.Worker $MASTER 4 | -------------------------------------------------------------------------------- /test/test_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ "$USER" != "root" ]]; then 4 | echo "please run as: sudo $0" 5 | exit 1 6 | fi 7 | 8 | BASEDIR=$(cd $(dirname $0); pwd)"/.." 9 | service_list=("spark:0.9.0" "shark:0.8.0" "spark:0.8.0" "spark:0.7.3" "shark:0.7.0" ) 10 | 11 | IMAGE_PREFIX="" 12 | #"amplab/" 13 | 14 | START=$(date) 15 | echo "starting tests at $START" > tests.log 16 | 17 | RESULT=0 18 | FAILED=0 19 | 20 | check_screen_session_alive() { 21 | screen -q -ls > /dev/null 22 | if (( $? < 10 )); then 23 | SCREEN_ALIVE=1 24 | fi 25 | } 26 | 27 | function wait_for_prompt() { 28 | service=$1 29 | OUTFILE=$2 30 | SCREEN_ALIVE=0 31 | 32 | if [[ "$service" == "spark" ]]; then 33 | query_string="scala>\s$" 34 | else 35 | query_string="^shark>\s$\|\s\s\s\s\s>\s$" 36 | fi 37 | 38 | tail -n 1 $OUTFILE | tr -d $'\r' | grep "$query_string" > /dev/null 39 | STOP="$?" 40 | until [[ "$STOP" == "0" ]]; do 41 | sleep 1 42 | check_screen_session_alive 43 | if [[ "$SCREEN_ALIVE" == "0" ]]; then 44 | sudo screen -S tmpshell -p 0 -X stuff $'\n' 45 | tail -n 1 $OUTFILE | tr -d $'\r' | grep "$query_string" > /dev/null 46 | STOP="$?" 47 | else 48 | break 49 | fi 50 | done 51 | } 52 | 53 | function check_result() { 54 | service=$1 55 | outfile=$2 56 | 57 | if [[ "$service" == "spark" ]]; then 58 | grep "Array(this is a test, more test, one more line)" $outfile > /dev/null 59 | RESULT="$?" 60 | elif [[ "$service" == "shark" ]]; then 61 | cat $outfile | tr -d $'\r' | grep "^500$" > /dev/null 62 | RESULT="$?" 
63 | fi 64 | } 65 | 66 | # NOTE: the order matters but this is the right one 67 | for i in ${service_list[@]}; do 68 | service=$(echo $i | awk -F ":" '{print $1}') 69 | version=$(echo $i | awk -F ":" '{print $2}') 70 | dirname=${service}-${version} 71 | LOGFILE=${BASEDIR}/test/${dirname}.log 72 | OUTFILE=${BASEDIR}/test/${dirname}.out 73 | rm -f "$LOGFILE" "$OUTFILE" 74 | START=$(date) 75 | echo "starting tests at $START" > $LOGFILE 76 | $BASEDIR/deploy/deploy.sh -i ${IMAGE_PREFIX}${i} 1>>$LOGFILE 2>&1 77 | NAMESERVER_IP=$(grep NAMESERVER_IP ${dirname}.log | awk '{print $2}') 78 | MASTER_IP=$(grep MASTER_IP ${dirname}.log | awk '{print $2}') 79 | 80 | # we need this to set screen's output logfile 81 | cat << EOF >/tmp/screenrc 82 | logfile $OUTFILE 83 | EOF 84 | cat > cmd.sh < /dev/null 2>&1 108 | 109 | $BASEDIR/deploy/kill_all.sh $service 1>> $LOGFILE 2>&1 110 | $BASEDIR/deploy/kill_all.sh nameserver 1>> $LOGFILE 2>&1 111 | check_result "$service" "$OUTFILE" 112 | echo "RESULT: $RESULT" >> $LOGFILE 113 | END=$(date) 114 | echo "ending tests at $END" >> $LOGFILE 115 | let "FAILED=FAILED+RESULT" 116 | done 117 | 118 | echo "FAILED: $FAILED" 119 | 120 | if [[ "$FAILED" == "0" ]]; then 121 | exit 0 122 | else 123 | exit 1 124 | fi 125 | --------------------------------------------------------------------------------
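
NOTE: test_all.sh above must run as root, expects to be invoked from the test/ directory (it greps the per-service logs by relative name), and exits 0 only when every service/version combination passes. A usage sketch from the repository root:

    #!/bin/bash
    cd test
    sudo ./test_all.sh
    echo "overall exit status: $?"   # 0 only if the FAILED counter stayed at 0
    grep "RESULT:" *.log             # per-service result (0 = pass)
    ls *.out                         # captured screen sessions, useful when a test fails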