├── .gitignore ├── README.md ├── apache-hadoop-hdfs-precise ├── Dockerfile ├── build └── files │ ├── authorized_keys │ ├── configure_hadoop.sh │ ├── core-site.xml │ ├── hdfs-site.xml │ └── id_rsa ├── build ├── README.txt ├── build_all.sh ├── push_all.sh └── tag_all.sh ├── deploy ├── deploy.sh ├── kill_all.sh ├── start_nameserver.sh ├── start_shell.sh └── start_spark_cluster.sh ├── dnsmasq-precise ├── Dockerfile ├── build └── files │ └── default_cmd ├── mesos ├── NOTE.txt ├── build ├── deploy │ ├── deploy │ └── start_mesos_cluster.sh ├── mesos-base │ ├── Dockerfile │ ├── build │ └── files │ │ └── configure_mesos.sh ├── mesos-master │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_mesos_master.sh ├── mesos-worker │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_mesos_worker.sh ├── shark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ └── default_cmd └── spark-shell │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── test.txt ├── shark-0.7.0 ├── build ├── shark-base │ ├── Dockerfile │ ├── build │ └── files │ │ ├── configure_shark.sh │ │ ├── hive-site.xml │ │ └── shark-env.sh ├── shark-master │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_shark_master.sh ├── shark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── test.shark └── shark-worker │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── run_shark_worker.sh ├── shark-0.8.0 ├── build ├── shark-base │ ├── Dockerfile │ ├── build │ └── files │ │ ├── configure_shark.sh │ │ ├── hive-site.xml │ │ └── shark-env.sh ├── shark-master │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_shark_master.sh ├── shark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── test.shark └── shark-worker │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── run_shark_worker.sh ├── spark-0.7.3 ├── build ├── spark-base │ ├── Dockerfile │ ├── build │ └── files │ │ ├── configure_spark.sh │ │ ├── log4j.properties │ │ └── spark-env.sh ├── spark-master │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_spark_master.sh ├── spark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ ├── test.spark │ │ └── test.txt └── spark-worker │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── run_spark_worker.sh ├── spark-0.8.0 ├── NOTE.txt ├── build ├── spark-base │ ├── Dockerfile │ ├── build │ └── files │ │ ├── configure_spark.sh │ │ ├── log4j.properties │ │ └── spark-env.sh ├── spark-master │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_spark_master.sh ├── spark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ ├── test.spark │ │ └── test.txt └── spark-worker │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── run_spark_worker.sh ├── spark-0.9.0 ├── NOTE.txt ├── build ├── spark-base │ ├── Dockerfile │ ├── build │ └── files │ │ ├── configure_spark.sh │ │ ├── log4j.properties │ │ └── spark-env.sh ├── spark-master │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_spark_master.sh ├── spark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ ├── test.spark │ │ └── test.txt └── spark-worker │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── run_spark_worker.sh ├── spark-0.9.1 ├── NOTE.txt ├── build ├── spark-base │ ├── Dockerfile │ ├── build │ └── files │ │ ├── configure_spark.sh │ │ ├── log4j.properties │ │ └── spark-env.sh ├── spark-master │ 
├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_spark_master.sh ├── spark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ ├── test.spark │ │ └── test.txt └── spark-worker │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── run_spark_worker.sh ├── spark-1.0.0 ├── NOTE.txt ├── build ├── spark-base │ ├── Dockerfile │ ├── build │ └── files │ │ ├── configure_spark.sh │ │ ├── log4j.properties │ │ └── spark-env.sh ├── spark-master │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ └── run_spark_master.sh ├── spark-shell │ ├── Dockerfile │ ├── build │ └── files │ │ ├── default_cmd │ │ ├── test.spark │ │ └── test.txt └── spark-worker │ ├── Dockerfile │ ├── build │ └── files │ ├── default_cmd │ └── run_spark_worker.sh └── test └── test_all.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | files.hash 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dockerfiles for Spark and Shark 2 | 3 | ## Contents 4 | 5 | Dockerfiles to build Spark and Shark images for testing and 6 | development. 7 | 8 | ## Requirements 9 | 10 | Tested on Ubuntu 12.04 (Docker version 0.6.4), Ubuntu 13.10 (Docker 0.7.0 and 0.9.0) with the virtual 11 | switch 12 | lxcbr0 13 | enabled. For running Docker on Mac and Windows see [the docs](http://docs.docker.io). 14 | Also tested inside the VirtualBox Tiny Core Linux VirtualBox VM for Docker on 15 | Mac. 16 | 17 | Note: the earlier version of the scripts had problems with newer 18 | versions of Docker (0.7). If you encounter issues please pull the 19 | latest changes from https://github.com/amplab/docker-scripts.git 20 | master branch. 21 | 22 | ## Tips for running on Mac OS 23 | If you are running on Mac OS, installed as described 24 | [in the Docker installation docs](http://docs.docker.io/en/latest/installation/mac/) 25 | you need to run all commands inside the Docker virtual machine by first ssh-ing into it: 26 | 27 |
 28 | $ ./boot2docker ssh
 29 | # User: docker
 30 | # Pwd:  tcuser
 31 | 
32 | 33 | Then make sure that `python` is installed. Otherwise install it via 34 | `tce-ab` (search for python and install `python.tcz`). Newer versions 35 | of the image that comes with boot2docker also do not have `bash` installed 36 | (install package `bash.tcz`), which is required for the deployment scripts. 37 | 38 | Further, make sure that the virtual machine running the Docker daemon and 39 | the containers has sufficient memory allocated (at least 2GB for two Spark worker 40 | containers and one master container). This can be done in the VirtualBox 41 | GUI under the properties of the virtual machine. 42 | 43 | Finally, `boot2docker save` is a good way to preserve changes to the image 44 | between restarts of the virtual machine or host computer, 45 | for example the scripts that come with the cloned git repository (see below). 46 | 47 | ## Testing 48 | 49 | First clone the repository: 50 | 51 | $ git clone https://github.com/amplab/docker-scripts.git 52 | 53 | This repository contains the deploy scripts and the sources for the Docker 54 | image files, which can be easily modified. The main deploy script 55 | takes the following options: 56 | 57 |
 58 | $ sudo ./deploy/deploy.sh
 59 | usage: ./deploy.sh -i <image> [-w <#workers>] [-v <data_directory>] [-c]
 60 | 
 61 |   image:    spark or shark image from:
 62 |                  amplab/spark:0.9.0  amplab/spark:0.9.1  amplab/spark:1.0.0
 63 |                  amplab/shark:0.8.0
 64 | 
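For example, a hypothetical invocation that starts a three-worker Spark 1.0.0 cluster, mounts a host directory under /data in the containers, and immediately starts a shell container (the host path is only an illustration):

    # -v takes any host directory; -c attaches a shell container right away
    $ sudo ./deploy/deploy.sh -i amplab/spark:1.0.0 -w 3 -v /home/andre/data -c

The individual options are explained below.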
65 | 66 | The script starts either a standalone Spark cluster or a standalone 67 | Spark/Shark cluster with a given number of worker nodes. Note that 68 | on the first call it may take a while for Docker to download the 69 | various images from the repository. 70 | 71 | In addition to Spark (and Shark) the cluster also runs a Hadoop HDFS 72 | filesystem. When the deploy script is run it generates one container 73 | for the master node, one container for each worker node, and one extra 74 | container running a Dnsmasq DNS forwarder. The latter can also be 75 | used to resolve node names on the host, for example to access the 76 | worker logs via the Spark web UI. 77 | 78 | Optionally one can set the number of workers (default: 2) and a data directory, 79 | which is a local path on the host that is mounted on the master and 80 | worker containers and appears there under /data. 81 | 82 | Both the Spark and Shark shells are started in a separate container. 83 | This container can be started directly by 84 | passing "-c" to the deploy script. 85 | 86 | Each node (worker and master) also runs an sshd which is 87 | _pre-configured with the given RSA key_. Note that you should change 88 | this key if you plan to expose services running inside the containers. 89 | Since the permissions of the key as cloned from the repository are 90 | likely wrong, you need to change them if you intend to log in with ssh: 91 | 92 |
93 | chmod go-rwx apache-hadoop-hdfs-precise/files/id_rsa
 94 | 
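If you do plan to expose the containers, one way to swap out the baked-in key pair is to generate a fresh one and rebuild the images. This is only a sketch, assuming you rebuild (and, if needed, re-tag) the images yourself afterwards rather than pulling them from the Docker repository:

    # overwrite the checked-in key pair with a freshly generated one (no passphrase)
    $ ssh-keygen -t rsa -N "" -f apache-hadoop-hdfs-precise/files/id_rsa
    # make the new public key the only authorized key
    $ cp apache-hadoop-hdfs-precise/files/id_rsa.pub apache-hadoop-hdfs-precise/files/authorized_keys
    # rebuild the images so they pick up the new key (see the Building section)
    $ sudo ./build/build_all.sh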
95 | 96 | ### Example: Running a Spark cluster 97 | 98 | Starting from the directory in which the repository was cloned do 99 | 100 | #### Deploy the cluster 101 | 102 | $ sudo ./deploy/deploy.sh -i amplab/spark:0.9.0 -w 3 103 | 104 | #### Wait a few seconds 105 | 106 | Wait for the "cluster" to come up. Note that it can take longer to download 107 | the container images the first time but after that the process is fairly quick. 108 | When the cluster comes up you should see something like this: 109 | 110 |
111 | > sudo ./deploy.sh -i amplab/spark:0.9.0 -w 3 
112 | *** Starting Spark 0.9.0 ***
113 | starting nameserver container
114 | started nameserver container:  069557913d98a37caf43f8238dfdf181aea5ab30eb42e382db83307e277cfa9e
115 | DNS host->IP file mapped:      /tmp/dnsdir_12015/0hosts
116 | NAMESERVER_IP:                 172.17.0.8
117 | waiting for nameserver to come up 
118 | starting master container
119 | started master container:      f50a65d2ef7b17bffed7075ac2de4a7b52c26adff15bdbe14d3280ef4991c9d6
120 | MASTER_IP:                     172.17.0.9
121 | waiting for master ........
122 | waiting for nameserver to find master 
123 | starting worker container
124 | started worker container:  576d7d223f59a6da7a0e73311d1e082fad27895aef53edf3635264fb00b70258
125 | starting worker container
126 | started worker container:  5672ea896e179b51fe2f1ae5d542c35706528cd3a768ba523324f434bb2b2413
127 | starting worker container
128 | started worker container:  3cdf681f7c99c1e19f7b580ac911e139923e9caca943fd006fb633aac5b20001
129 | waiting for workers to register .....
130 | 
131 | ***********************************************************************
132 | start shell via:            sudo /home/andre/docker-scripts/deploy/start_shell.sh -i amplab/spark-shell:0.9.0 -n 069557913d98a37caf43f8238dfdf181aea5ab30eb42e382db83307e277cfa9e 
133 | 
134 | visit Spark WebUI at:       http://172.17.0.9:8080/
135 | visit Hadoop Namenode at:   http://172.17.0.9:50070
136 | ssh into master via:        ssh -i /home/andre/docker-scripts/deploy/../apache-hadoop-hdfs-precise/files/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no root@172.17.0.9
137 | 
138 | /data mapped:               
139 | 
140 | kill master via:           sudo docker kill f50a65d2ef7b17bffed7075ac2de4a7b52c26adff15bdbe14d3280ef4991c9d6
141 | ***********************************************************************
142 | 
143 | to enable cluster name resolution add the following line to _the top_ of your host's /etc/resolv.conf:
144 | nameserver 172.17.0.8
145 | 
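The last two lines of the output concern name resolution on the host (see also the Tips section below). One way to prepend the nameserver entry, shown here with the NAMESERVER_IP from this example run, is the following; note that the edit may be undone if /etc/resolv.conf is managed automatically (e.g. by resolvconf or NetworkManager):

    # substitute the NAMESERVER_IP printed by your own deploy run
    $ sudo sed -i '1i nameserver 172.17.0.8' /etc/resolv.conf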
146 | 147 | #### Start the Spark shell container as shown above, for example: 148 | 149 | $ sudo /home/andre/docker-scripts/deploy/start_shell.sh -i amplab/spark-shell:0.9.0 -n 069557913d98a37caf43f8 150 | 151 | The parameter passed with -n is the ID of the nameserver container. 152 | Then attach to the running shell via the given command, for example: 153 | 154 | $ sudo docker attach 9ac49b09bf18a13c7 155 | 156 | If the screen appears to stay blank just hit return to get to the prompt. 157 | 158 | #### Execute an example: 159 | 160 |
161 | scala> val textFile = sc.textFile("hdfs://master:9000/user/hdfs/test.txt")
162 | scala> textFile.count()
163 | scala> textFile.map({line => line}).collect()
164 | 
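The test.txt read above is a small sample file that ships with the spark-shell image. To run the same commands against your own data you can copy a file into HDFS from the master node. This is only a sketch, assuming the master IP and ssh key from the example output above and a cluster deployed with -v so that the host data directory is visible under /data (mydata.txt is a placeholder):

    # on the host: log into the master container (command as printed by deploy.sh)
    $ ssh -i apache-hadoop-hdfs-precise/files/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no root@172.17.0.9
    # inside the master container: copy a file from the mounted /data directory into HDFS as the hdfs user
    $ sudo -u hdfs hadoop fs -put /data/mydata.txt /user/hdfs/mydata.txt

Afterwards the file can be read from the shell via sc.textFile("hdfs://master:9000/user/hdfs/mydata.txt").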
165 | 166 | 167 | #### Terminate the cluster: 168 | 169 | $ sudo ./deploy/kill_all.sh spark 170 | $ sudo ./deploy/kill_all.sh nameserver 171 | 172 | ### Shark 173 | 174 | Basically the same steps apply, except that the Shark images are chosen instead of the Spark ones 175 | (the former contain the Shark binaries in addition to Spark). 176 | 177 | #### Deploy the cluster 178 | 179 | $ sudo ./deploy/deploy.sh -i amplab/shark:0.8.0 -w 3 180 | 181 | #### Wait a few seconds 182 | 183 | Wait for the "cluster" to come up. Note that it can take longer to download 184 | the container images the first time, but after that the process is fairly quick. 185 | When the cluster comes up you should see something like this: 186 | 187 |
188 | *** Starting Shark 0.8.0 + Spark ***
189 | starting nameserver container
190 | started nameserver container:  952d22e085c3b74e829e006ab536d45d31800c463832e43d8679bbf3d703940e
191 | DNS host->IP file mapped:      /tmp/dnsdir_30578/0hosts
192 | NAMESERVER_IP:                 172.17.0.13
193 | waiting for nameserver to come up 
194 | starting master container
195 | started master container:      169f253eaddadb19b6eb28e79f148eef892f20d34602ffb42d3e57625dc61652
196 | MASTER_IP:                     172.17.0.14
197 | waiting for master ........
198 | waiting for nameserver to find master 
199 | starting worker container
200 | started worker container:  1c6920c96d5ad684a2f591bfb334323c5854cdd7a0da49982baaf77dc4d62ac7
201 | starting worker container
202 | started worker container:  7250dcfb882e2d17441c8c59361d10d8c59afb2b295719ba35f59bc72c6f17a5
203 | starting worker container
204 | started worker container:  26823e188a2a5a5897ed4b9bf0fca711dc7f98674fe62eb78fb49cf031bec79c
205 | waiting for workers to register .......
206 | 
207 | ***********************************************************************
208 | start shell via:            sudo /home/andre/docker-scripts/deploy/start_shell.sh -i amplab/shark-shell:0.8.0 -n 952d22e085c3b74e829e006ab536d45d31800c463832e43d8679bbf3d703940e 
209 | 
210 | visit Spark WebUI at:       http://172.17.0.14:8080/
211 | visit Hadoop Namenode at:   http://172.17.0.14:50070
212 | ssh into master via:        ssh -i /home/andre/docker-scripts/deploy/../apache-hadoop-hdfs-precise/files/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no root@172.17.0.14
213 | 
214 | /data mapped:               
215 | 
216 | kill master via:           sudo docker kill 169f253eaddadb19b6eb28e79f148eef892f20d34602ffb42d3e57625dc61652
217 | ***********************************************************************
218 | 
219 | to enable cluster name resolution add the following line to _the top_ of your host's /etc/resolv.conf:
220 | nameserver 172.17.0.13
221 | 
222 | 223 | #### Start the Shark shell container as shown above, for example: 224 | 225 | $ sudo /home/andre/docker-scripts/deploy/start_shell.sh -i amplab/shark-shell:0.8.0 -n 952d22e085c3b74e829e00 226 | 227 | The parameter passed with -n is the ID of the nameserver container. 228 | Then attach to the running shell via the given command, for example: 229 | 230 | $ sudo docker attach 9ac49b09bf18a13c7 231 | 232 | If the screen appears to stay blank just hit return to get to the prompt. 233 | 234 | #### Execute an example: 235 | 236 |
237 | shark> CREATE TABLE src(key INT, value STRING);
238 | shark> LOAD DATA LOCAL INPATH '${env:HIVE_HOME}/examples/files/kv1.txt' INTO TABLE src;
239 | shark> SELECT COUNT(1) FROM src;
240 | 
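To double-check that the loaded data actually ended up in HDFS, you can list the table's directory from the master node. This is only a sketch and assumes the default Hive warehouse location (/user/hive/warehouse); the hive-site.xml shipped with the shark images may configure a different path:

    # inside the master container (ssh command as printed by deploy.sh)
    $ sudo -u hdfs hadoop fs -ls /user/hive/warehouse/src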
241 | 242 | #### Terminate the cluster: 243 | 244 | $ sudo ./deploy/kill_all.sh shark 245 | $ sudo ./deploy/kill_all.sh nameserver 246 | 247 | ## Building 248 | 249 | If you prefer to build the images yourself (or intend to modify them) rather 250 | than downloading them from the Docker repository, you can build 251 | all Spark and Shark images in the correct order via the build script: 252 | 253 | $ ./build/build_all.sh 254 | 255 | The script builds the images in an order that satisfies the chain of 256 | dependencies: 257 | 258 | apache-hadoop-hdfs-precise -> spark-base -> spark-{master, worker, shell} 259 | 260 | apache-hadoop-hdfs-precise -> spark-base -> shark-base -> shark-{master, worker, shell} 261 | 262 | You can always (re-)build single images by cd-ing into the image directory and doing 263 | 264 | $ . build 265 | 266 | ## Best practices for Dockerfiles and startup scripts 267 | 268 | The following are just some conventions that made the generation of the images easier. They 269 | are not enforced in any way by Docker. 270 | 271 | The images and startup scripts follow the structure below in order to reuse 272 | as much as possible of the image they depend on. There are two types of images, 273 | base images and leaf images. Leaf images, as the name suggests, 274 | are images that are leaves in the dependency tree. For example, spark-base as a base 275 | image depends on apache-hadoop-hdfs-precise. spark-master depends on spark-base as 276 | its base image and is itself a leaf. 277 | 278 | In addition to its Dockerfile, each image has a 279 | files/ 280 | subdirectory in its image directory that contains files (config files, data files) that will be copied 281 | to the 282 | /root/image_name_files 283 | directory inside the image. 284 | 285 | ### Base images 286 | 287 | Base images are images that are intended to be extended by other images and therefore do not 288 | have a default command or entry point. They are good for testing though, e.g., by running 289 | /bin/bash 290 | inside them. 291 | 292 | 293 | For base images such as spark-base, besides data files the 294 | files/ 295 | directory also contains 296 | files/configure_spark.sh 297 | which is a script that contains four functions: 298 | 299 | * create_spark_directories 300 | for creating required directories such as the working directory 301 | * deploy_spark_files 302 | which copies files from 303 | /root/image_name_files 304 | to the required system path locations 305 | * configure_spark 306 | which changes settings in config files and takes the IP of the master as an argument 307 | * prepare_spark 308 | which calls the previous three in the given order and takes the IP of the master as an argument 309 | 310 | 311 | All of the functions of a __base-image__'s configure script (so also those inside 312 | files/configure_spark.sh 313 | ), except __prepare_spark__, first call their corresponding functions in the image that spark-base depends on (apache-hadoop-hdfs-precise in this case). Therefore all the underlying services get initialized before the top-level service. 314 | 315 | ### Leaf images 316 | 317 | For leaf images such as spark-master, besides data files the 318 | files/ 319 | directory also contains 320 | files/default_cmd 321 | which is chosen in the image's Dockerfile as the default command (or entry point) of the image. This means the command 322 | inside is executed whenever the container is started. 323 | 324 | 325 | The default command script executes the following steps, in this order: 326 | 327 | 1.
The first thing the default command does is call the prepare 328 | function of the configure script inside its base image. In this case, the default command script calls the function 329 | prepare_spark 330 | inside 331 | /root/spark-base/configure_spark.sh 332 | which is the location the configure script of spark-base was copied to. 333 | 2. After that, now that the base image's configuration (and the configuration of the images it inherits from) has completed, the 334 | default command may start services it relies on, such as the Hadoop namenode service in the case of spark-master. 335 | 3. Finally, the default command script of spark-master runs a second script under userid hdfs 336 | (the Hadoop HDFS super user), which is 337 | files/run_spark_master.sh 338 | and which actually starts the master. 339 | 340 | 341 | The spark-worker default command proceeds along the same lines but starts a Spark worker together with a Hadoop datanode instead. 342 | 343 | ## Tips 344 | 345 | ### Name resolution on host 346 | 347 | In order to resolve names (such as "master", "worker1", etc.), add the IP 348 | of the nameserver container to the top of /etc/resolv.conf on the host. 349 | 350 | ### Maintaining local Docker image repository 351 | 352 | After a while of building and debugging images, the local image repository gets 353 | full of intermediate images that serve no real purpose other than 354 | debugging a broken build. To remove these, do 355 | 356 | $ sudo docker images | grep "<none>" | awk '{print $3}' | xargs sudo docker rmi 357 | 358 | Also, data from stopped containers tends to accumulate. In order to remove all container data (__only do this when no containers are running__) do 359 | 360 | $ sudo docker rm `sudo docker ps -a -q` 361 | -------------------------------------------------------------------------------- /apache-hadoop-hdfs-precise/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base Ubuntu Precise 12.04 LTS image 2 | # 3 | FROM ubuntu:precise 4 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 5 | 6 | # Setup a volume for data 7 | VOLUME ["/data"] 8 | 9 | # Set correct source list 10 | RUN echo "deb http://archive.ubuntu.com/ubuntu precise main universe" > /etc/apt/sources.list 11 | RUN echo "deb http://archive.ubuntu.com/ubuntu precise-updates main universe" >> /etc/apt/sources.list 12 | 13 | # install a few other useful packages plus OpenJDK 7 14 | RUN apt-get update && apt-get upgrade -y && apt-get install -y less openjdk-7-jre-headless net-tools vim-tiny sudo openssh-server iputils-ping python2.7 15 | 16 | # Install Hadoop 17 | ADD http://mirror.sdunix.com/apache/hadoop/common/hadoop-1.2.1/hadoop_1.2.1-1_x86_64.deb /root/ 18 | RUN dpkg -i /root/hadoop_1.2.1-1_x86_64.deb && rm /root/hadoop_1.2.1-1_x86_64.deb 19 | 20 | # Docker messes up /etc/hosts and adds two entries for 127.0.0.1 21 | # we try to recover from that by giving /etc/resolv.conf and therefore 22 | # the nameserver priority 23 | RUN sed -i s/"files dns"/"dns files"/ /etc/nsswitch.conf 24 | 25 | # add Hadoop config file templates 26 | ADD files /root/hadoop_files 27 | 28 | # Set JAVA_HOME 29 | ENV JAVA_HOME /usr/lib/jvm/java-7-openjdk-amd64 30 | -------------------------------------------------------------------------------- /apache-hadoop-hdfs-precise/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . 
-type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}apache-hadoop-hdfs-precise:1.2.1 . 5 | -------------------------------------------------------------------------------- /apache-hadoop-hdfs-precise/files/authorized_keys: -------------------------------------------------------------------------------- 1 | ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDp2atNK3bux0z3d2Aojkl231Lf6X7HZUYIBt3XzUs+wnTzzB/eH2ubS5Wdwyy5daA4itsvX6hI1o/LQOfRBdjXqIVl+IFXFdwNQ0saCSNh65O2ynuMwsxUXhBJAGoBg6sTXq1ZPNQk1JqopUBP6+H4jpnKFW3JosON9QopQdkkYIz/frHs3HojfbydQesGNovanKrGYV3QeFVQDPxseufRZtHjrTk1hQ3FEayQCTyqJ8JDE6DMrirNEVBTuuNZ/Z2afPLWcZIKQ46E73p9HhqcaWEph6xQ3Ha/WV9oK0jenfz4b+sGrUItTbzuP8SsUiA4yZrZaN4BubDi4oPALOr/ root@423e412aa505 2 | -------------------------------------------------------------------------------- /apache-hadoop-hdfs-precise/files/configure_hadoop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | hadoop_files=( "/root/hadoop_files/core-site.xml" "/root/hadoop_files/hdfs-site.xml" ) 4 | 5 | function create_hadoop_directories() { 6 | rm -rf /root/.ssh 7 | mkdir /root/.ssh 8 | chmod go-rx /root/.ssh 9 | mkdir /var/run/sshd 10 | } 11 | 12 | function deploy_hadoop_files() { 13 | for i in "${hadoop_files[@]}"; 14 | do 15 | filename=$(basename $i); 16 | cp $i /etc/hadoop/$filename; 17 | done 18 | cp /root/hadoop_files/id_rsa /root/.ssh 19 | chmod go-rwx /root/.ssh/id_rsa 20 | cp /root/hadoop_files/authorized_keys /root/.ssh/authorized_keys 21 | chmod go-wx /root/.ssh/authorized_keys 22 | } 23 | 24 | function configure_hadoop() { 25 | sed -i s/__MASTER__/$1/ /etc/hadoop/core-site.xml 26 | sed -i s/"JAVA_HOME=\/usr\/lib\/jvm\/java-6-sun"/"JAVA_HOME=\/usr\/lib\/jvm\/java-7-openjdk-amd64"/ /etc/hadoop/hadoop-env.sh 27 | } 28 | 29 | function prepare_hadoop() { 30 | create_hadoop_directories 31 | deploy_hadoop_files 32 | configure_hadoop $1 33 | } 34 | -------------------------------------------------------------------------------- /apache-hadoop-hdfs-precise/files/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fs.default.name 4 | hdfs://__MASTER__:9000 5 | 6 | 7 | -------------------------------------------------------------------------------- /apache-hadoop-hdfs-precise/files/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | dfs.replication 4 | 1 5 | 6 | 7 | -------------------------------------------------------------------------------- /apache-hadoop-hdfs-precise/files/id_rsa: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | MIIEpAIBAAKCAQEA6dmrTSt27sdM93dgKI5Jdt9S3+l+x2VGCAbd181LPsJ088wf 3 | 3h9rm0uVncMsuXWgOIrbL1+oSNaPy0Dn0QXY16iFZfiBVxXcDUNLGgkjYeuTtsp7 4 | jMLMVF4QSQBqAYOrE16tWTzUJNSaqKVAT+vh+I6ZyhVtyaLDjfUKKUHZJGCM/36x 5 | 7Nx6I328nUHrBjaL2pyqxmFd0HhVUAz8bHrn0WbR4605NYUNxRGskAk8qifCQxOg 6 | zK4qzRFQU7rjWf2dmnzy1nGSCkOOhO96fR4anGlhKYesUNx2v1lfaCtI3p38+G/r 7 | Bq1CLU287j/ErFIgOMma2WjeAbmw4uKDwCzq/wIDAQABAoIBAQCBgFZZ/Pj3EI2x 8 | +XzZ2LocR144u7DGsXHP3iWabYj+72ce3+rB8np/3KK1ZDFvXxFkXpk1Ke8irxeg 9 | gogd+/PysdN1/eF6nZNoEN0VRPxALNp3frhe4j2PdyvjkYQi5IynxGWRJpuA7e/b 10 | 9u+fksxn/mhyPd23rRhIk+uVn26lsnccHhCkfqr+Szm/xFsTUhYQ1B8bfrqhA1Le 11 | WRrBa03JXocd2y3TdzeaQ+AtvbpAy9Fc28N7xkDsuh+H1y74jRhFzBXd4WnYuxze 12 | 
/PAD3hpgtCDGGnGpwE2SMM8fZJ7vLOPAsMUuz1tvLbKcoTTdaUw4fBur/XQHloW7 13 | k7adoW6BAoGBAP0bdE1uynnwZOFDhmpMvdYfodwlv3Far+QZwVroSa64YWBaeAef 14 | v0AO75p/EiQJEGWB9bgOAyrbOFdRqLtUF14lQw4ZLUV7sQu/o2Z0sVMSRCVWuNDf 15 | W8sk74RtH3WB7lutOMP3WyYopOUZtTK1rZrRNxD4+edq7+utAba+DLS/AoGBAOyF 16 | 31hype9DkOHgD/jWU7tNrJprLkNkSHe/Aq5JdKesgw84AOSKO4W1/uXOly4VOt6Z 17 | 54eeW1gt+uKT292GEl66TO8PIxszfsUzpYpTKkSzrl5OsM9hUlitJwpff/D9Mbxw 18 | fZWt0EjKlBQWc83sMBwCe8ZyNh/WueBIKH5HjhnBAoGAEwFRvVK5X2iemo+Qc0Dp 19 | 7D8Zz0cCVgeiN3V7oFDa34S2wx5n7uKe4Ld+ZFJwUUZg9c5JXhWnRTuKwnu+OLq6 20 | unX/z/ox/Qqpo6EzKslOW1d+yHL3k6+B3AIc/guXliI4fKfIIGbdcEMTBqTkhzc/ 21 | HuXgxaR8V1UfSMoH2+nvWE8CgYAcw4MP3JF1cYATGA6ZMmdoZd/Rv6sWowF1HpOS 22 | 4nf/VCl0Fll1caIfdqyTAfa8sfRA0fKoOYfeR2k1WMnqPL3LK1jj0bFxQ2ftT4SY 23 | N9jyFe/kpCk4bxt2kUgoKMkEY6ZCxmNfao3j7E7pynk217xaC6tFzOnsIU7liaDz 24 | CnyrgQKBgQDtjairs6ehaqRu8Uk44gQoNIlReJ8qp7YmfPlK8ylFNTALs37c4308 25 | Qbjp+jLt7w+XMYnNaZPSNN1mt6EyWFSqUc+5QbfQpbw1cZRI1UBIQDwJjZUS04Ou 26 | H75Rif72nQxHh9Ly5CMNCEyioin7kq945vQbyAwyEr7+tomhUZaq9g== 27 | -----END RSA PRIVATE KEY----- 28 | -------------------------------------------------------------------------------- /build/README.txt: -------------------------------------------------------------------------------- 1 | Building and publishing images to the amplab account: 2 | 1. make sure IMAGE_PREFIX="" (see build_all.sh) 3 | 2. build_all.sh 4 | 3. set IMAGE_PREFIX="amplab/" 5 | 4. build_all.sh 6 | 5. tag_all.sh 7 | 6. push_all.sh 8 | -------------------------------------------------------------------------------- /build/build_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ "$USER" != "root" ]]; then 4 | echo "please run as: sudo $0" 5 | exit 1 6 | fi 7 | 8 | CURDIR=$(pwd) 9 | BASEDIR=$(cd $(dirname $0); pwd)"/.." 
10 | dir_list=( "dnsmasq-precise" "apache-hadoop-hdfs-precise" "spark-0.7.3" "shark-0.7.0" "spark-0.8.0" "spark-0.9.0" "shark-0.8.0" ) 11 | 12 | export IMAGE_PREFIX="" 13 | #"amplab/" 14 | 15 | # NOTE: the order matters but this is the right one 16 | for i in ${dir_list[@]}; do 17 | echo building $i; 18 | cd ${BASEDIR}/$i 19 | cat build 20 | ./build 21 | done 22 | cd $CURDIR 23 | -------------------------------------------------------------------------------- /build/push_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ "$USER" != "root" ]]; then 4 | echo "please run as: sudo $0" 5 | exit 1 6 | fi 7 | 8 | image_list=( "apache-hadoop-hdfs-precise" "dnsmasq-precise" "spark-master" "spark-worker" "spark-shell" "shark-master" "shark-worker" "shark-shell" ) 9 | 10 | IMAGE_PREFIX="amplab/" 11 | 12 | # NOTE: the order matters but this is the right one 13 | for i in ${image_list[@]}; do 14 | echo docker push ${IMAGE_PREFIX}${i} 15 | docker push ${IMAGE_PREFIX}${i} 16 | done 17 | -------------------------------------------------------------------------------- /build/tag_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ "$USER" != "root" ]]; then 4 | echo "please run as: sudo $0" 5 | exit 1 6 | fi 7 | 8 | image_list=("spark-master:0.9.0" "spark-worker:0.9.0" "spark-shell:0.9.0" "shark-master:0.8.0" "shark-worker:0.8.0" "shark-shell:0.8.0" ) 9 | 10 | IMAGE_PREFIX="amplab/" 11 | 12 | # NOTE: the order matters but this is the right one 13 | for i in ${image_list[@]}; do 14 | image=$(echo $i | awk -F ":" '{print $1}') 15 | echo docker tag ${IMAGE_PREFIX}${i} ${IMAGE_PREFIX}${image}:latest 16 | docker tag ${IMAGE_PREFIX}${i} ${IMAGE_PREFIX}${image}:latest 17 | done 18 | -------------------------------------------------------------------------------- /deploy/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEBUG=0 4 | BASEDIR=$(cd $(dirname $0); pwd) 5 | 6 | spark_images=( "amplab/spark:0.9.0" "amplab/spark:0.9.1" "amplab/spark:1.0.0") 7 | shark_images=( "amplab/shark:0.8.0" ) 8 | NAMESERVER_IMAGE="amplab/dnsmasq-precise" 9 | 10 | start_shell=0 11 | VOLUME_MAP="" 12 | 13 | image_type="?" 14 | image_version="?" 15 | NUM_WORKERS=2 16 | 17 | source $BASEDIR/start_nameserver.sh 18 | source $BASEDIR/start_spark_cluster.sh 19 | 20 | function check_root() { 21 | if [[ "$USER" != "root" ]]; then 22 | echo "please run as: sudo $0" 23 | exit 1 24 | fi 25 | } 26 | 27 | function print_help() { 28 | echo "usage: $0 -i [-w <#workers>] [-v ] [-c]" 29 | echo "" 30 | echo " image: spark or shark image from:" 31 | echo -n " " 32 | for i in ${spark_images[@]}; do 33 | echo -n " $i" 34 | done 35 | echo "" 36 | echo -n " " 37 | for i in ${shark_images[@]}; do 38 | echo -n " $i" 39 | done 40 | echo "" 41 | } 42 | 43 | function parse_options() { 44 | while getopts "i:w:cv:h" opt; do 45 | case $opt in 46 | i) 47 | echo "$OPTARG" | grep "spark:" > /dev/null; 48 | if [ "$?" -eq 0 ]; then 49 | image_type="spark" 50 | fi 51 | echo "$OPTARG" | grep "shark:" > /dev/null; 52 | if [ "$?" 
-eq 0 ]; then 53 | image_type="shark" 54 | fi 55 | image_name=$(echo "$OPTARG" | awk -F ":" '{print $1}') 56 | image_version=$(echo "$OPTARG" | awk -F ":" '{print $2}') 57 | ;; 58 | w) 59 | NUM_WORKERS=$OPTARG 60 | ;; 61 | h) 62 | print_help 63 | exit 0 64 | ;; 65 | c) 66 | start_shell=1 67 | ;; 68 | v) 69 | VOLUME_MAP=$OPTARG 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$image_type" == "?" ]; then 75 | echo "missing or invalid option: -i " 76 | exit 1 77 | fi 78 | 79 | if [ ! "$VOLUME_MAP" == "" ]; then 80 | echo "data volume chosen: $VOLUME_MAP" 81 | VOLUME_MAP="-v $VOLUME_MAP:/data" 82 | fi 83 | } 84 | 85 | check_root 86 | 87 | if [[ "$#" -eq 0 ]]; then 88 | print_help 89 | exit 1 90 | fi 91 | 92 | parse_options $@ 93 | 94 | if [ "$image_type" == "spark" ]; then 95 | SPARK_VERSION="$image_version" 96 | echo "*** Starting Spark $SPARK_VERSION ***" 97 | elif [ "$image_type" == "shark" ]; then 98 | SHARK_VERSION="$image_version" 99 | # note: we currently don't have a Shark 0.9 image but it's safe Spark 100 | # to Shark's version for all but Shark 0.7.0 101 | if [ "$SHARK_VERSION" == "0.9.0" ] || [ "$SHARK_VERSION" == "0.8.0" ]; then 102 | SPARK_VERSION="$SHARK_VERSION" 103 | else 104 | SPARK_VERSION="0.7.3" 105 | fi 106 | echo "*** Starting Shark $SHARK_VERSION + Spark ***" 107 | else 108 | echo "not starting anything" 109 | exit 0 110 | fi 111 | 112 | start_nameserver $NAMESERVER_IMAGE 113 | wait_for_nameserver 114 | start_master ${image_name}-master $image_version 115 | wait_for_master 116 | if [ "$image_type" == "spark" ]; then 117 | SHELLCOMMAND="sudo $BASEDIR/start_shell.sh -i ${image_name}-shell:$SPARK_VERSION -n $NAMESERVER $VOLUME_MAP" 118 | elif [ "$image_type" == "shark" ]; then 119 | SHELLCOMMAND="sudo $BASEDIR/start_shell.sh -i ${image_name}-shell:$SHARK_VERSION -n $NAMESERVER $VOLUME_MAP" 120 | fi 121 | 122 | start_workers ${image_name}-worker $image_version 123 | get_num_registered_workers 124 | echo -n "waiting for workers to register " 125 | until [[ "$NUM_REGISTERED_WORKERS" == "$NUM_WORKERS" ]]; do 126 | echo -n "." 
127 | sleep 1 128 | get_num_registered_workers 129 | done 130 | echo "" 131 | print_cluster_info "$SHELLCOMMAND" 132 | if [[ "$start_shell" -eq 1 ]]; then 133 | SHELL_ID=$($SHELLCOMMAND | tail -n 1 | awk '{print $4}') 134 | sudo docker attach $SHELL_ID 135 | fi 136 | -------------------------------------------------------------------------------- /deploy/kill_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function kill_containers() { 4 | containers=($1) 5 | for i in "${containers[@]}"; do 6 | echo "killing container $i" 7 | sudo docker kill "$i" 8 | done 9 | } 10 | 11 | if [ "$#" -ne "1" ]; then 12 | echo -e "usage:\n $0 spark\n $0 shark\n $0 mesos\n $0 nameserver" 13 | exit 1; 14 | fi 15 | 16 | if [[ "$USER" != "root" ]]; then 17 | echo "please run as: sudo $0" 18 | exit 1 19 | fi 20 | 21 | clustertype=$1 22 | 23 | if [[ "$clustertype" == "nameserver" ]]; then 24 | nameserver=$(sudo docker ps | grep dnsmasq_files | awk '{print $1}' | tr '\n' ' ') 25 | kill_containers "$nameserver" 26 | else 27 | master=$(sudo docker ps | grep ${clustertype}_master | awk '{print $1}' | tr '\n' ' ') 28 | workers=$(sudo docker ps | grep ${clustertype}_worker | awk '{print $1}' | tr '\n' ' ') 29 | shells=$(sudo docker ps | grep ${clustertype}_shell | awk '{print $1}' | tr '\n' ' ') 30 | kill_containers "$master" 31 | kill_containers "$workers" 32 | kill_containers "$shells" 33 | fi 34 | 35 | -------------------------------------------------------------------------------- /deploy/start_nameserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAMESERVER=-1 4 | NAMESERVER_IP= 5 | DOMAINNAME= 6 | #".mycluster.com" 7 | 8 | # starts the dnsmasq nameserver 9 | function start_nameserver() { 10 | DNSDIR="/tmp/dnsdir_$RANDOM" 11 | DNSFILE="${DNSDIR}/0hosts" 12 | mkdir $DNSDIR 13 | 14 | echo "starting nameserver container" 15 | if [ "$DEBUG" -gt 0 ]; then 16 | echo sudo docker run -d -h nameserver${DOMAINNAME} -v $DNSDIR:/etc/dnsmasq.d $1 17 | fi 18 | NAMESERVER=$(sudo docker run -d -h nameserver${DOMAINNAME} -v $DNSDIR:/etc/dnsmasq.d $1) 19 | 20 | if [ "$NAMESERVER" = "" ]; then 21 | echo "error: could not start nameserver container from image $1" 22 | exit 1 23 | fi 24 | 25 | echo "started nameserver container: $NAMESERVER" 26 | echo "DNS host->IP file mapped: $DNSFILE" 27 | sleep 2 28 | NAMESERVER_IP=$(sudo docker logs $NAMESERVER 2>&1 | egrep '^NAMESERVER_IP=' | awk -F= '{print $2}' | tr -d -c "[:digit:] .") 29 | echo "NAMESERVER_IP: $NAMESERVER_IP" 30 | echo "address=\"/nameserver/$NAMESERVER_IP\"" > $DNSFILE 31 | } 32 | 33 | # contact nameserver container and resolve IP address (used for checking whether nameserver has registered 34 | # presence of new container). note: only returns exit code 35 | function check_hostname() { 36 | local __resultvar=$1 37 | local val_hostname=$2 38 | local val_expected_ip=$3 39 | if which dig >/dev/null; then 40 | DNSCMD="dig $val_hostname @${NAMESERVER_IP} | grep ANSWER -A1 | grep $val_expected_ip > /dev/null" 41 | else 42 | DNSCMD="nslookup $val_hostname $NAMESERVER_IP | grep Address | tail -n 1 | grep $val_expected_ip > /dev/null" 43 | fi 44 | #echo "DNSCMD: $DNSCMD" 45 | eval $DNSCMD 46 | eval $__resultvar=$? 
47 | } 48 | 49 | # contact nameserver container and resolve IP address 50 | function resolve_hostname() { 51 | local __resultvar=$1 52 | local val_hostname=$2 53 | if which dig >/dev/null; then 54 | DNSCMD="dig $val_hostname @${NAMESERVER_IP} | grep ANSWER -A1 | tail -n 1 | awk '{print \$5}'" 55 | else 56 | DNSCMD="nslookup $val_hostname $NAMESERVER_IP | grep Address | tail -n 1 | awk -F":" '{print \$2}' | awk '{print \$1}'" 57 | fi 58 | #echo "DNSCMD: $DNSCMD" 59 | tmpval=$(eval "$DNSCMD") 60 | eval $__resultvar="$tmpval" 61 | } 62 | 63 | function wait_for_nameserver { 64 | echo -n "waiting for nameserver to come up " 65 | # Note: the original scripts assumed the nameserver resolves its own 66 | # hostname to 127.0.0.1 67 | # With newer versions of Docker that is not necessarily the case anymore. 68 | # Thanks to bmustafa (24601 on GitHub) for reporting and proposing a fix! 69 | check_hostname result nameserver "$NAMESERVER_IP" 70 | until [ "$result" -eq 0 ]; do 71 | echo -n "." 72 | sleep 1 73 | check_hostname result nameserver "$NAMESERVER_IP" 74 | done 75 | echo "" 76 | } 77 | -------------------------------------------------------------------------------- /deploy/start_shell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASEDIR=$(cd $(dirname $0); pwd) 4 | source $BASEDIR/start_nameserver.sh 5 | 6 | SHELL_ID=-1 7 | SHELL_IP= 8 | NAMESERVER_IP= 9 | NAMESERVER_DIR= 10 | NAMESERVER_ID=-1 11 | 12 | image_type="?" 13 | 14 | DEBUG=1 15 | 16 | # TODO: remove redundant image list definition (source from file common to deploy.sh) 17 | spark_shell_images=( "amplab/spark-shell:0.9.0" "amplab/spark-shell:0.9.1" "amplab/spark-shell:1.0.0") 18 | shark_shell_images=( "amplab/shark-shell:0.8.0" ) 19 | 20 | # TODO: unify with deploy.sh 21 | function check_root() { 22 | if [[ "$USER" != "root" ]]; then 23 | echo "please run as: sudo $0" 24 | exit 1 25 | fi 26 | } 27 | 28 | function print_help() { 29 | echo "usage: $0 -i -n [-v ]" 30 | echo "" 31 | echo " image: spark or shark image from:" 32 | echo -n " " 33 | for i in ${spark_shell_images[@]}; do 34 | echo -n " $i" 35 | done 36 | echo "" 37 | echo -n " " 38 | for i in ${shark_shell_images[@]}; do 39 | echo -n " $i" 40 | done 41 | echo "" 42 | } 43 | 44 | function parse_options() { 45 | while getopts "i:n:v:h" opt; do 46 | case $opt in 47 | i) 48 | echo "$OPTARG" | grep "spark-shell:" > /dev/null; 49 | if [ "$?" -eq 0 ]; then 50 | image_type="spark" 51 | fi 52 | echo "$OPTARG" | grep "shark-shell:" > /dev/null; 53 | if [ "$?" -eq 0 ]; then 54 | image_type="shark" 55 | fi 56 | image_name=$(echo "$OPTARG" | awk -F ":" '{print $1}') 57 | image_version=$(echo "$OPTARG" | awk -F ":" '{print $2}') 58 | ;; 59 | h) 60 | print_help 61 | exit 0 62 | ;; 63 | v) 64 | VOLUME_MAP=$OPTARG 65 | ;; 66 | n) 67 | NAMESERVER_ID=$OPTARG 68 | ;; 69 | esac 70 | done 71 | 72 | if [ "$image_type" == "?" ]; then 73 | echo "missing or invalid option: -i " 74 | exit 1 75 | fi 76 | 77 | if [ ! "$VOLUME_MAP" == "" ]; then 78 | echo "data volume chosen: $VOLUME_MAP" 79 | VOLUME_MAP="-v $VOLUME_MAP:/data" 80 | fi 81 | } 82 | 83 | # TODO: generalize and refactor this with the code for updating 84 | # master and worker nameserver entries. 
85 | function set_nameserver_data() { 86 | IMAGENAME="$image_name:$image_version" 87 | DNSDIR=$(sudo docker inspect $NAMESERVER_ID | \ 88 | grep dnsdir | awk '{print $2}' | tr -d '":') 89 | DNSFILE="${DNSDIR}/0hosts" 90 | SHELL_IP=$(docker inspect $SHELL_ID | \ 91 | grep IPAddress | awk '{print $2}' | tr -d '":,') 92 | 93 | if [ "$DEBUG" -gt 0 ]; then 94 | echo "NAMESERVER_IP: $NAMESERVER_IP" 95 | echo "DNSFILE: $DNSFILE" 96 | echo "SHELL_IP: $SHELL_IP" 97 | echo "SHELL_HOSTNAME: $SHELL_HOSTNAME" 98 | fi 99 | 100 | echo "address=\"/$SHELL_HOSTNAME/$SHELL_IP\"" | sudo tee -a $DNSFILE > /dev/null 101 | } 102 | 103 | # starts the spark/shark shell container 104 | function start_shell() { 105 | IMAGENAME="$image_name:$image_version" 106 | NAMESERVER_IP=$(docker inspect $NAMESERVER_ID | \ 107 | grep IPAddress | awk '{print $2}' | tr -d '":,') 108 | 109 | if [ "$NAMESERVER_IP" = "" ]; then 110 | echo "error: cannot determine nameserver IP" 111 | exit 1 112 | fi 113 | 114 | #MASTER_IP=$(dig master @$NAMESERVER_IP | grep ANSWER -A1 | \ 115 | # tail -n 1 | awk '{print $5}') 116 | resolve_hostname MASTER_IP master 117 | 118 | if [ "$MASTER_IP" = "" ]; then 119 | echo "error: cannot determine master IP" 120 | exit 1 121 | fi 122 | 123 | SHELL_HOSTNAME="shell$RANDOM" 124 | echo "starting shell container" 125 | if [ "$DEBUG" -gt 0 ]; then 126 | echo sudo docker run -i -t -d --dns $NAMESERVER_IP -h $SHELL_HOSTNAME $VOLUME_MAP $IMAGENAME $MASTER_IP 127 | fi 128 | SHELL_ID=$(sudo docker run -i -t -d --dns $NAMESERVER_IP -h $SHELL_HOSTNAME $VOLUME_MAP $IMAGENAME $MASTER_IP) 129 | 130 | if [ "$SHELL_ID" = "" ]; then 131 | echo "error: could not start shell container from image $IMAGENAME" 132 | exit 1 133 | fi 134 | } 135 | 136 | check_root 137 | 138 | if [[ "$#" -eq 0 ]]; then 139 | print_help 140 | exit 1 141 | fi 142 | 143 | parse_options $@ 144 | 145 | if [ "$image_type" == "spark" ]; then 146 | SPARK_VERSION="$image_version" 147 | echo "*** Starting Spark $SPARK_VERSION Shell ***" 148 | elif [ "$image_type" == "shark" ]; then 149 | SHARK_VERSION="$image_version" 150 | # note: we currently don't have a Shark 0.9 image but it's safe Spark 151 | # to Shark's version for all but Shark 0.7.0 152 | if [ "$SHARK_VERSION" == "0.9.0" ] || [ "$SHARK_VERSION" == "0.8.0" ]; then 153 | SPARK_VERSION="$SHARK_VERSION" 154 | else 155 | SPARK_VERSION="0.7.3" 156 | fi 157 | echo "*** Starting Shark $SHARK_VERSION + Spark Shell ***" 158 | else 159 | echo "not starting anything" 160 | exit 0 161 | fi 162 | 163 | start_shell 164 | 165 | sleep 2 166 | 167 | set_nameserver_data 168 | 169 | echo -n "waiting for nameserver to find shell " 170 | SHELL_IP=$(docker inspect $SHELL_ID | \ 171 | grep IPAddress | awk '{print $2}' | tr -d '":,') 172 | 173 | check_hostname result $SHELL_HOSTNAME $SHELL_IP 174 | until [ "$result" -eq 0 ]; do 175 | echo -n "." 
176 | sleep 1 177 | check_hostname result $SHELL_HOSTNAME $SHELL_IP 178 | done 179 | 180 | echo "" 181 | echo "***************************************************************" 182 | echo "connect to shell via:" 183 | echo "sudo docker attach $SHELL_ID" 184 | 185 | -------------------------------------------------------------------------------- /deploy/start_spark_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MASTER=-1 4 | MASTER_IP= 5 | NUM_REGISTERED_WORKERS=0 6 | 7 | # starts the Spark/Shark master container 8 | function start_master() { 9 | echo "starting master container" 10 | if [ "$DEBUG" -gt 0 ]; then 11 | echo sudo docker run -d --dns $NAMESERVER_IP -h master${DOMAINNAME} $VOLUME_MAP $1:$2 12 | fi 13 | MASTER=$(sudo docker run -d --dns $NAMESERVER_IP -h master${DOMAINNAME} $VOLUME_MAP $1:$2) 14 | 15 | if [ "$MASTER" = "" ]; then 16 | echo "error: could not start master container from image $1:$2" 17 | exit 1 18 | fi 19 | 20 | echo "started master container: $MASTER" 21 | sleep 3 22 | MASTER_IP=$(sudo docker logs $MASTER 2>&1 | egrep '^MASTER_IP=' | awk -F= '{print $2}' | tr -d -c "[:digit:] .") 23 | echo "MASTER_IP: $MASTER_IP" 24 | echo "address=\"/master/$MASTER_IP\"" >> $DNSFILE 25 | } 26 | 27 | # starts a number of Spark/Shark workers 28 | function start_workers() { 29 | for i in `seq 1 $NUM_WORKERS`; do 30 | echo "starting worker container" 31 | hostname="worker${i}${DOMAINNAME}" 32 | if [ "$DEBUG" -gt 0 ]; then 33 | echo sudo docker run -d --dns $NAMESERVER_IP -h $hostname $VOLUME_MAP $1:$2 ${MASTER_IP} 34 | fi 35 | WORKER=$(sudo docker run -d --dns $NAMESERVER_IP -h $hostname $VOLUME_MAP $1:$2 ${MASTER_IP}) 36 | 37 | if [ "$WORKER" = "" ]; then 38 | echo "error: could not start worker container from image $1:$2" 39 | exit 1 40 | fi 41 | 42 | echo "started worker container: $WORKER" 43 | sleep 3 44 | WORKER_IP=$(sudo docker logs $WORKER 2>&1 | egrep '^WORKER_IP=' | awk -F= '{print $2}' | tr -d -c "[:digit:] .") 45 | echo "address=\"/$hostname/$WORKER_IP\"" >> $DNSFILE 46 | done 47 | } 48 | 49 | # prints out information on the cluster 50 | function print_cluster_info() { 51 | BASEDIR=$(cd $(dirname $0); pwd)"/.." 52 | echo "" 53 | echo "***********************************************************************" 54 | echo "start shell via: $1" 55 | echo "" 56 | echo "visit Spark WebUI at: http://$MASTER_IP:8080/" 57 | echo "visit Hadoop Namenode at: http://$MASTER_IP:50070" 58 | echo "ssh into master via: ssh -i $BASEDIR/apache-hadoop-hdfs-precise/files/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no root@${MASTER_IP}" 59 | echo "" 60 | echo "/data mapped: $VOLUME_MAP" 61 | echo "" 62 | echo "kill master via: sudo docker kill $MASTER" 63 | echo "***********************************************************************" 64 | echo "" 65 | echo "to enable cluster name resolution add the following line to _the top_ of your host's /etc/resolv.conf:" 66 | echo "nameserver $NAMESERVER_IP" 67 | } 68 | 69 | function get_num_registered_workers() { 70 | if [[ "$SPARK_VERSION" == "0.7.3" ]]; then 71 | DATA=$( curl --noproxy -s http://$MASTER_IP:8080/?format=json | tr -d '\n' | sed s/\"/\\\\\"/g) 72 | else 73 | # Docker on Mac uses tinycore Linux with busybox which has a limited version wget (?) 74 | echo $(uname -a) | grep "Linux boot2docker" > /dev/null 75 | if [[ "$?" 
== "0" ]]; then 76 | DATA=$( wget -Y off -q -O - http://$MASTER_IP:8080/json | tr -d '\n' | sed s/\"/\\\\\"/g) 77 | else 78 | DATA=$( wget --no-proxy -q -O - http://$MASTER_IP:8080/json | tr -d '\n' | sed s/\"/\\\\\"/g) 79 | fi 80 | fi 81 | NUM_REGISTERED_WORKERS=$(python -c "import json; data = \"$DATA\"; value = json.loads(data); print len(value['workers'])") 82 | } 83 | 84 | function wait_for_master { 85 | if [[ "$SPARK_VERSION" == "0.7.3" ]]; then 86 | query_string="INFO HttpServer: akka://sparkMaster/user/HttpServer started" 87 | elif [[ "$SPARK_VERSION" == "1.0.0" ]]; then 88 | query_string="MasterWebUI: Started MasterWebUI" 89 | else 90 | query_string="MasterWebUI: Started Master web UI" 91 | fi 92 | echo -n "waiting for master " 93 | sudo docker logs $MASTER | grep "$query_string" > /dev/null 94 | until [ "$?" -eq 0 ]; do 95 | echo -n "." 96 | sleep 1 97 | sudo docker logs $MASTER | grep "$query_string" > /dev/null; 98 | done 99 | echo "" 100 | echo -n "waiting for nameserver to find master " 101 | check_hostname result master "$MASTER_IP" 102 | until [ "$result" -eq 0 ]; do 103 | echo -n "." 104 | sleep 1 105 | check_hostname result master "$MASTER_IP" 106 | done 107 | echo "" 108 | sleep 3 109 | } 110 | -------------------------------------------------------------------------------- /dnsmasq-precise/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:precise 2 | 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | VOLUME [ "/etc/dnsmasq.d" ] 6 | 7 | RUN apt-get install -y dnsmasq-base 8 | 9 | RUN echo "user=root" > /etc/dnsmasq.conf 10 | RUN echo "listen-address=__LOCAL_IP__" >> /etc/dnsmasq.conf 11 | RUN echo "resolv-file=/etc/resolv.dnsmasq.conf" >> /etc/dnsmasq.conf 12 | RUN echo "conf-dir=/etc/dnsmasq.d" >> /etc/dnsmasq.conf 13 | RUN echo "domain=cluster.com" >> /etc/dnsmasq.conf 14 | 15 | RUN echo "nameserver 8.8.8.8" >> /etc/resolv.dnsmasq.conf 16 | 17 | ADD files /root/dnsmasq_files 18 | 19 | CMD ["/root/dnsmasq_files/default_cmd"] 20 | -------------------------------------------------------------------------------- /dnsmasq-precise/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}dnsmasq-precise . 
5 | -------------------------------------------------------------------------------- /dnsmasq-precise/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 4 | echo "NAMESERVER_IP=$IP" 5 | 6 | sed -i s/__LOCAL_IP__/$IP/ /etc/dnsmasq.conf 7 | 8 | dnsmasq 9 | 10 | while [ 1 ]; 11 | do 12 | sleep 3 13 | # kill and restart dnsmasq every three seconds 14 | # in case its configuration has changed 15 | pkill dnsmasq 16 | dnsmasq 17 | done 18 | -------------------------------------------------------------------------------- /mesos/NOTE.txt: -------------------------------------------------------------------------------- 1 | For build place pre-compiled mesos installation into file: 2 | mesos/mesos-base/files/mesos.tgz 3 | -------------------------------------------------------------------------------- /mesos/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mesos_dirs=$(ls -d mesos* spark-shell shark-shell) 4 | dir_list=("$mesos_dirs") 5 | 6 | # NOTE: the order matters but this is the right one 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /mesos/deploy/deploy: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # determines which Mesos image is chosen 4 | MESOS_VERSION=0.13.0 5 | 6 | # set this value to the number of workers you want 7 | NUM_WORKERS=2 8 | 9 | if [[ "$USER" != "root" ]]; then 10 | echo "please run as: sudo $0" 11 | exit 1 12 | fi 13 | 14 | source ../../dnsmasq-precise/deploy/start_nameserver.sh 15 | source start_mesos_cluster.sh 16 | 17 | echo "*** Starting Mesos $MESOS_VERSION ***" 18 | start_nameserver 19 | sleep 5 20 | start_mesos_master 21 | sleep 40 22 | start_mesos_workers 23 | sleep 3 24 | print_cluster_info 25 | 26 | -------------------------------------------------------------------------------- /mesos/deploy/start_mesos_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MASTER=-1 4 | MASTER_IP= 5 | 6 | # starts the Mesos master container 7 | function start_mesos_master() { 8 | echo "starting Mesos master container" 9 | MASTER=$(sudo docker run -i -t -d -dns $NAMESERVER_IP -h master mesos-master:$MESOS_VERSION) 10 | echo "started master container: $MASTER" 11 | sleep 3 12 | MASTER_IP=$(sudo docker logs $MASTER 2>&1 | egrep '^MASTER_IP=' | awk -F= '{print $2}' | tr -d -c "[:digit:] .") 13 | echo "MASTER_IP: $MASTER_IP" 14 | echo "address=\"/master/$MASTER_IP\"" >> $DNSFILE 15 | } 16 | 17 | # starts a number of Mesos workers 18 | function start_mesos_workers() { 19 | for i in `seq 1 $NUM_WORKERS`; do 20 | echo "starting Mesos worker container" 21 | hostname="worker${i}" 22 | WORKER=$(sudo docker run -d -dns $NAMESERVER_IP -h $hostname mesos-worker:${MESOS_VERSION} ${MASTER_IP} ${MASTER_IP}:5050) 23 | echo "started worker container: $WORKER" 24 | sleep 3 25 | WORKER_IP=$(sudo docker logs $WORKER 2>&1 | egrep '^WORKER_IP=' | awk -F= '{print $2}' | tr -d -c "[:digit:] .") 26 | echo "address=\"/$hostname/$WORKER_IP\"" >> $DNSFILE 27 | done 28 | } 29 | 30 | # prints out information on the cluster 31 | function print_cluster_info() { 32 | echo "" 33 | echo 
"***********************************************************************" 34 | echo "visit Mesos WebUI at: http://$MASTER_IP:5050/" 35 | echo "visit Hadoop Namenode at: http://$MASTER_IP:50070" 36 | echo "" 37 | echo "start Spark Shell: sudo docker run -i -t -dns $NAMESERVER_IP -h spark-client spark-shell-mesos:0.7.3 $MASTER_IP" 38 | echo "start Shark Shell: sudo docker run -i -t -dns $NAMESERVER_IP -h shark-client shark-shell-mesos:0.7.0 $MASTER_IP" 39 | echo "" 40 | echo "ssh into master via: ssh -i ../../apache-hadoop-hdfs-precise/files/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no root@${MASTER_IP}" 41 | echo "" 42 | echo "kill cluster via: docker/kill_all" 43 | echo "***********************************************************************" 44 | echo "" 45 | echo "to enable cluster name resolution add the following line to _the top_ of your host's /etc/resolv.conf:" 46 | echo "nameserver $NAMESERVER_IP" 47 | } 48 | 49 | -------------------------------------------------------------------------------- /mesos/mesos-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base Ubuntu Precise 12.04 LTS image 2 | # 3 | FROM amplab/shark-base:0.7.0 4 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 5 | 6 | #RUN apt-get install -y libcurl4-openssl-dev 7 | RUN apt-get install -y libcurl3 8 | 9 | # add Hadoop config file templates 10 | # NOTE: we rather do this as a single ADD statement 11 | # since we are running into 12 | # Error build: Unable to mount using aufs 13 | # Unable to mount using aufs 14 | # issue. For more information see 15 | # https://github.com/dotcloud/docker/issues/1171 16 | ADD files /root/mesos_files 17 | 18 | RUN (mv /root/mesos_files/mesos.tgz / && cd / && gunzip < mesos.tgz)|(cd /opt && tar -xvf -) && (rm /mesos.tgz && ln -s /opt/mesos /tmp/mesos) 19 | 20 | -------------------------------------------------------------------------------- /mesos/mesos-base/build: -------------------------------------------------------------------------------- 1 | sudo docker build -t amplab/mesos-base:0.13.0 . 
2 | -------------------------------------------------------------------------------- /mesos/mesos-base/files/configure_mesos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/shark_files/configure_shark.sh 4 | 5 | function create_mesos_directories() { 6 | create_shark_directories 7 | mkdir /tmp/mesos 8 | chown hdfs.hdfs /tmp/mesos 9 | } 10 | 11 | function deploy_mesos_files() { 12 | deploy_shark_files 13 | } 14 | 15 | function configure_mesos() { 16 | configure_shark $1 17 | sed -i s/"^export MASTER="/"#export MASTER="/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 18 | echo "export MASTER=mesos://$1:5050" >> /opt/spark-$SPARK_VERSION/conf/spark-env.sh 19 | echo "export MESOS_NATIVE_LIBRARY=/opt/mesos/lib/libmesos-0.13.0.so" >> /opt/spark-$SPARK_VERSION/conf/spark-env.sh 20 | echo "export JAVA_LIBRARY_PATH=/opt/mesos/lib/libmesos-0.13.0.so" >> /opt/spark-$SPARK_VERSION/conf/spark-env.sh 21 | } 22 | 23 | function prepare_mesos() { 24 | create_mesos_directories 25 | deploy_mesos_files 26 | configure_mesos $1 27 | } 28 | -------------------------------------------------------------------------------- /mesos/mesos-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # Mesos 2 | FROM amplab/mesos-base:0.13.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Setup a volume for data 6 | #VOLUME ["/data"] 7 | 8 | ADD files /root/mesos_master_files 9 | 10 | CMD ["/root/mesos_master_files/default_cmd"] 11 | -------------------------------------------------------------------------------- /mesos/mesos-master/build: -------------------------------------------------------------------------------- 1 | sudo docker build -t amplab/mesos-master:0.13.0 . 
2 | -------------------------------------------------------------------------------- /mesos/mesos-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env 4 | 5 | source /root/mesos_files/configure_mesos.sh 6 | 7 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 8 | echo "MASTER_IP=$IP" 9 | 10 | echo "preparing Mesos" 11 | prepare_mesos $IP 12 | 13 | echo "starting Hadoop Namenode" 14 | sudo -u hdfs hadoop namenode -format 15 | service hadoop-namenode start 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | sleep 5 21 | 22 | echo "starting Mesos Master" 23 | cp /root/mesos_master_files/run_mesos_master.sh / 24 | chmod a+rx /run_mesos_master.sh 25 | sudo -u hdfs LD_LIBRARY_PATH=$LD_LIBRARY_PATH /run_mesos_master.sh $IP 26 | -------------------------------------------------------------------------------- /mesos/mesos-master/files/run_mesos_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export LD_LIBRARY_PATH=/usr/lib/jvm/java-7-openjdk-amd64/jre/lib/amd64/server 3 | cd /opt/mesos/sbin && ./mesos-master --ip=$1 4 | -------------------------------------------------------------------------------- /mesos/mesos-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Mesos 2 | FROM amplab/mesos-base:0.13.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Setup a volume for data 6 | #VOLUME ["/data"] 7 | 8 | ADD files /root/mesos_worker_files 9 | 10 | # Add the entrypoint script for the master 11 | CMD ["-h"] 12 | ENTRYPOINT ["/root/mesos_worker_files/default_cmd"] 13 | -------------------------------------------------------------------------------- /mesos/mesos-worker/build: -------------------------------------------------------------------------------- 1 | sudo docker build -t amplab/mesos-worker:0.13.0 . 
2 | -------------------------------------------------------------------------------- /mesos/mesos-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/mesos_files/configure_mesos.sh 4 | 5 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 6 | echo "WORKER_IP=$IP" 7 | 8 | echo "preparing Mesos" 9 | prepare_mesos $1 10 | 11 | echo "starting Hadoop Datanode" 12 | service hadoop-datanode start 13 | 14 | echo "starting sshd" 15 | /usr/sbin/sshd 16 | 17 | sleep 5 18 | 19 | echo "starting Mesos Worker" 20 | cp /root/mesos_worker_files/run_mesos_worker.sh / 21 | chmod a+rx /run_mesos_worker.sh 22 | sudo -u hdfs HADOOP_HOME=$HADOOP_HOME LD_LIBRARY_PATH=$LD_LIBRARY_PATH /run_mesos_worker.sh $2 $IP 23 | -------------------------------------------------------------------------------- /mesos/mesos-worker/files/run_mesos_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export LD_LIBRARY_PATH=/usr/lib/jvm/java-7-openjdk-amd64/jre/lib/amd64/server 3 | cd /opt/mesos/sbin && ./mesos-slave --master=$1 --ip=$2 --hadoop_home=$HADOOP_HOME 4 | -------------------------------------------------------------------------------- /mesos/shark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM amplab/mesos-base:0.13.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | ADD files /root/shark_shell_files 6 | 7 | # Add the entrypoint script for the master 8 | ENTRYPOINT ["/root/shark_shell_files/default_cmd"] 9 | -------------------------------------------------------------------------------- /mesos/shark-shell/build: -------------------------------------------------------------------------------- 1 | sudo docker build -t amplab/shark-shell-mesos:0.7.0 . 2 | -------------------------------------------------------------------------------- /mesos/shark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/mesos_files/configure_mesos.sh 4 | 5 | env 6 | 7 | echo "preparing Mesos" 8 | prepare_mesos $1 9 | 10 | echo "starting Shark Shell" 11 | cd /opt/metastore && sudo -u hdfs /opt/shark-$SHARK_VERSION/bin/shark 12 | -------------------------------------------------------------------------------- /mesos/spark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM amplab/mesos-base:0.13.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | ADD files /root/spark_shell_files 6 | 7 | # Add the entrypoint script for the master 8 | ENTRYPOINT ["/root/spark_shell_files/default_cmd"] 9 | -------------------------------------------------------------------------------- /mesos/spark-shell/build: -------------------------------------------------------------------------------- 1 | sudo docker build -t amplab/spark-shell-mesos:0.7.3 . 
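Because the mesos-worker image pairs an ENTRYPOINT with a default CMD of ["-h"], any arguments given to docker run replace the -h and are forwarded to its default_cmd: $1 is the master's IP (consumed by prepare_mesos) and $2 is the value handed to mesos-slave as --master. A hedged manual invocation follows; the IP is a placeholder, the host:port form for --master is an assumption, and mesos/deploy/start_mesos_cluster.sh remains the supported way to bring a cluster up:

    # illustrative only; real clusters are started via the mesos deploy scripts
    MASTER_IP=10.0.0.2   # hypothetical address printed by the master container as MASTER_IP=...
    sudo docker run -d amplab/mesos-worker:0.13.0 ${MASTER_IP} ${MASTER_IP}:5050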
2 | -------------------------------------------------------------------------------- /mesos/spark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/mesos_files/configure_mesos.sh 4 | 5 | env 6 | 7 | echo "preparing Mesos" 8 | prepare_mesos $1 9 | 10 | echo "adding test data to HDFS" 11 | cp /root/spark_shell_files/test.txt /tmp 12 | sudo -u hdfs hadoop fs -put /tmp/test.txt hdfs://$1:9000/user/hdfs/test.txt 13 | 14 | echo "starting Spark Shell" 15 | cd $SPARK_HOME 16 | echo SPARK_HOME: `pwd` 17 | echo SHARK_VERSION: $SHARK_VERSION 18 | if [ "$SPARK_VERSION" == "0.8.0" ] || [ "$SPARK_VERSION" == "0.7.3" ]; then 19 | sudo -u hdfs HDFS_PREFIX=hdfs://${1}:9000 ./spark-shell 20 | else 21 | sudo -u hdfs HDFS_PREFIX=hdfs://${1}:9000 ./bin/spark-shell 22 | fi 23 | -------------------------------------------------------------------------------- /mesos/spark-shell/files/test.txt: -------------------------------------------------------------------------------- 1 | this is a test 2 | more test 3 | one more line 4 | -------------------------------------------------------------------------------- /shark-0.7.0/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | shark_dirs=$(ls -d shark*) 4 | dir_list=("$shark_dirs") 5 | 6 | # NOTE: the order matters but this is the right one 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 0.7.3, Shark 0.7.0 2 | # Version 0.7.0 3 | # 4 | # Use spark-base as base 5 | FROM spark-base:0.7.3 6 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 7 | 8 | # note: SPARK_VERSION should be inherited from spark-base 9 | # but for some reason isn't (?) 10 | ENV SPARK_VERSION 0.7.3 11 | ENV SHARK_VERSION 0.7.0 12 | ENV HIVE_VERSION 0.9.0 13 | 14 | # Install Shark 15 | ADD http://spark-project.org/download/shark-${SHARK_VERSION}-hadoop1-bin.tgz / 16 | RUN (cd / && gunzip < shark-${SHARK_VERSION}-hadoop1-bin.tgz)|(cd /opt && tar -xvf -) 17 | RUN rm /shark-${SHARK_VERSION}-hadoop1-bin.tgz 18 | 19 | # Add Shark config files and configure script 20 | ADD files /root/shark_files 21 | 22 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-base/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-base:0.7.0 . 
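Note that shark-base is built FROM spark-base:0.7.3, so the Spark 0.7.3 images (and the apache-hadoop-hdfs-precise base beneath them) must already exist locally. A hedged sketch of building in dependency order by hand, assuming each directory's build script is sourced from inside that directory, as the version-level build scripts above do; the repository's top-level build helpers automate this:

    # illustrative dependency order when building manually
    (cd apache-hadoop-hdfs-precise && . build)
    (cd spark-0.7.3 && . build)
    (cd shark-0.7.0 && . build)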
5 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-base/files/configure_shark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | shark_files=( "/root/shark_files/shark-env.sh" ) 6 | hive_files=( "/root/shark_files/hive-site.xml" "/etc/hadoop/core-site.xml" ) 7 | 8 | function create_shark_directories() { 9 | create_spark_directories 10 | rm -rf /opt/metastore 11 | mkdir /opt/metastore 12 | chown hdfs.hdfs /opt/metastore 13 | } 14 | 15 | function deploy_shark_files() { 16 | deploy_spark_files 17 | for i in "${hive_files[@]}"; 18 | do 19 | filename=$(basename $i); 20 | cp $i /opt/hive-${HIVE_VERSION}-bin/conf/$filename; 21 | done 22 | for i in "${shark_files[@]}"; 23 | do 24 | filename=$(basename $i); 25 | cp $i /opt/shark-${SHARK_VERSION}/conf/$filename; 26 | done 27 | } 28 | 29 | function configure_shark() { 30 | configure_spark $1 31 | # Shark 32 | sed -i s/__SPARK_HOME__/"\/opt\/spark-${SPARK_VERSION}"/ /opt/shark-$SHARK_VERSION/conf/shark-env.sh 33 | sed -i s/__HIVE_HOME__/"\/opt\/hive-${HIVE_VERSION}-bin"/ /opt/shark-$SHARK_VERSION/conf/shark-env.sh 34 | # Hive 35 | sed -i s/__MASTER__/$1/ /opt/hive-0.9.0-bin/conf/hive-site.xml 36 | #sed -i s/__MASTER__/master/ /opt/hive-0.9.0-bin/conf/hive-site.xml 37 | } 38 | 39 | function prepare_shark() { 40 | create_shark_directories 41 | deploy_shark_files 42 | configure_shark $1 43 | } 44 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-base/files/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fs.default.name 4 | hdfs://__MASTER__:9000/ 5 | 6 | 7 | fs.defaultFS 8 | hdfs://__MASTER__:9000/ 9 | 10 | 11 | mapred.job.tracker 12 | NONE 13 | 14 | 15 | hive.exec.scratchdir 16 | /tmp/hive-scratch 17 | Scratch space for Hive jobs 18 | 19 | 20 | hive.metastore.local 21 | true 22 | 23 | 24 | javax.jdo.option.ConnectionURL 25 | jdbc:derby:;databaseName=metastore_db;create=true 26 | 27 | 28 | javax.jdo.option.ConnectionDriverName 29 | org.apache.derby.jdbc.EmbeddedDriver 30 | 31 | 32 | hive.metastore.metadb.dir 33 | file:///opt/metastore/metadb/ 34 | 35 | 36 | hive.metastore.uris 37 | file:///opt/metastore/metadb/ 38 | 39 | 40 | hive.metastore.warehouse.dir 41 | hdfs://__MASTER__:9000/user/hdfs/warehouse 42 | 43 | 44 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-base/files/shark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | . __SPARK_HOME__/conf/spark-env.sh 3 | export SHARK_MASTER_MEM=700m 4 | export HIVE_HOME=__HIVE_HOME__ 5 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # Shark master 2 | FROM shark-base:0.7.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Add run script 6 | ADD files /root/shark_master_files 7 | 8 | # Add default command for master 9 | CMD ["/root/shark_master_files/default_cmd"] 10 | 11 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-master/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . 
-type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-master:0.7.0 . 5 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # run this as: 4 | # sudo docker run -i -t -d shark-master:$SHARK_VERSION 5 | 6 | source /root/shark_files/configure_shark.sh 7 | 8 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 9 | echo "MASTER_IP=$IP" 10 | 11 | echo "preparing Shark" 12 | prepare_shark $IP 13 | 14 | echo "starting Hadoop namenode" 15 | sudo -u hdfs hadoop namenode -format > /dev/null 2>&1 16 | service hadoop-namenode start > /dev/null 2>&1 17 | 18 | echo "starting sshd" 19 | /usr/sbin/sshd 20 | 21 | sleep 5 22 | 23 | echo "starting Shark master" 24 | cp /root/shark_master_files/run_shark_master.sh / 25 | chmod a+rx /run_shark_master.sh 26 | sudo -u hdfs /run_shark_master.sh 27 | #$IP 28 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-master/files/run_shark_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/shark-0.7.0/conf/shark-env.sh 3 | export PATH=$PATH:$SCALA_HOME/bin 4 | export CLASSPATH=$CLASSPATH:$SCALA_HOME/lib/scala-library.jar 5 | #/opt/spark-0.7.3/run spark.deploy.master.Master -i $1 6 | /opt/spark-0.7.3/run spark.deploy.master.Master -i master 7 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Shark master 2 | FROM shark-base:0.7.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Add run script 6 | ADD files /root/shark_shell_files 7 | 8 | # Add default command for master 9 | ENTRYPOINT ["/root/shark_shell_files/default_cmd"] 10 | 11 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-shell/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-shell:0.7.0 . 5 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/shark_files/configure_shark.sh 4 | prepare_shark $1 5 | env 6 | sudo -u hdfs hadoop dfsadmin -safemode wait 7 | 8 | # Note: there are issues if the nameserver did not have time to 9 | # refresh its cache with this shell's hostname so give him time 10 | # to do so. 
11 | sleep 3 12 | 13 | cd /opt/metastore && sudo -u hdfs /opt/shark-$SHARK_VERSION/bin/shark 14 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-shell/files/test.shark: -------------------------------------------------------------------------------- 1 | CREATE TABLE src(key INT, value STRING); 2 | LOAD DATA LOCAL INPATH '${env:HIVE_HOME}/examples/files/kv1.txt' INTO TABLE src; 3 | SELECT COUNT(1) FROM src; 4 | exit; 5 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Shark worker 2 | FROM shark-base:0.7.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Add run script 6 | ADD files /root/shark_worker_files 7 | 8 | # Add the entrypoint script for the worker 9 | ENTRYPOINT ["/root/shark_worker_files/default_cmd"] 10 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-worker/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-worker:0.7.0 . 5 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # run this as: 4 | # sudo docker run -d shark-worker:${SHARK_VERSION} ${MASTER_IP} spark://${MASTER_IP}:7077 5 | 6 | source /root/shark_files/configure_shark.sh 7 | 8 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 9 | echo "WORKER_IP=$IP" 10 | 11 | echo "preparing Shark" 12 | prepare_shark $1 13 | 14 | echo "starting Hadoop datanode" 15 | service hadoop-datanode start 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | sleep 5 21 | 22 | echo "starting Shark worker node" 23 | cp /root/shark_worker_files/run_shark_worker.sh / 24 | chmod a+rx /run_shark_worker.sh 25 | sudo -u hdfs /run_shark_worker.sh 26 | #$2 27 | -------------------------------------------------------------------------------- /shark-0.7.0/shark-worker/files/run_shark_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/shark-0.7.0/conf/shark-env.sh 3 | export PATH=$PATH:$SCALA_HOME/bin 4 | export CLASSPATH=$CLASSPATH:$SCALA_HOME/lib/scala-library.jar 5 | #/opt/spark-0.7.3/run spark.deploy.worker.Worker $1 6 | #/opt/spark-0.7.3/run spark.deploy.worker.Worker -i $(hostname) spark://master:7077 7 | ${SPARK_HOME}/run spark.deploy.worker.Worker spark://master:7077 8 | -------------------------------------------------------------------------------- /shark-0.8.0/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | shark_dirs=$(ls -d shark*) 4 | dir_list=("$shark_dirs") 5 | 6 | # NOTE: the order matters but this is the right one 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . 
build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 0.8.0, Shark 0.8.0 2 | # 3 | # Use spark-base as base 4 | FROM spark-base:0.8.0 5 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 6 | 7 | # note: SPARK_VERSION should be inherited from spark-base 8 | # but for some reason isn't (?) 9 | ENV SPARK_VERSION 0.8.0 10 | ENV SHARK_VERSION 0.8.0 11 | ENV HIVE_VERSION 0.9.0 12 | 13 | # Install Shark 14 | ADD https://github.com/amplab/shark/releases/download/v${SHARK_VERSION}/shark-${SHARK_VERSION}-bin-hadoop1.tgz / 15 | RUN (cd / && gunzip < shark-${SHARK_VERSION}-bin-hadoop1.tgz)|(cd /opt && tar -xvf -) 16 | RUN (ln -s /opt/shark-${SHARK_VERSION}-bin-hadoop1/shark-${SHARK_VERSION} /opt/shark-${SHARK_VERSION} && ln -s /opt/shark-${SHARK_VERSION}-bin-hadoop1/hive-${HIVE_VERSION}-shark-${SHARK_VERSION}-bin /opt/hive-${HIVE_VERSION}-bin && rm /shark-${SHARK_VERSION}-bin-hadoop1.tgz) 17 | 18 | # Add Shark config files and configure script 19 | ADD files /root/shark_files 20 | 21 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-base/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-base:0.8.0 . 5 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-base/files/configure_shark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | shark_files=( "/root/shark_files/shark-env.sh" ) 6 | hive_files=( "/root/shark_files/hive-site.xml" "/etc/hadoop/core-site.xml" ) 7 | 8 | function create_shark_directories() { 9 | create_spark_directories 10 | rm -rf /opt/metastore 11 | mkdir /opt/metastore 12 | chown hdfs.hdfs /opt/metastore 13 | } 14 | 15 | function deploy_shark_files() { 16 | deploy_spark_files 17 | for i in "${hive_files[@]}"; 18 | do 19 | filename=$(basename $i); 20 | cp $i /opt/hive-${HIVE_VERSION}-bin/conf/$filename; 21 | done 22 | for i in "${shark_files[@]}"; 23 | do 24 | filename=$(basename $i); 25 | cp $i /opt/shark-${SHARK_VERSION}/conf/$filename; 26 | done 27 | } 28 | 29 | function configure_shark() { 30 | configure_spark $1 31 | # Shark 32 | sed -i s/__SPARK_HOME__/"\/opt\/spark-${SPARK_VERSION}"/ /opt/shark-$SHARK_VERSION/conf/shark-env.sh 33 | sed -i s/__HIVE_HOME__/"\/opt\/hive-${HIVE_VERSION}-bin"/ /opt/shark-$SHARK_VERSION/conf/shark-env.sh 34 | # Hive 35 | sed -i s/__MASTER__/$1/ /opt/hive-0.9.0-bin/conf/hive-site.xml 36 | } 37 | 38 | function prepare_shark() { 39 | create_shark_directories 40 | deploy_shark_files 41 | configure_shark $1 42 | } 43 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-base/files/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fs.default.name 4 | hdfs://__MASTER__:9000/ 5 | 6 | 7 | fs.defaultFS 8 | hdfs://__MASTER__:9000/ 9 | 10 | 11 | mapred.job.tracker 12 | NONE 13 | 14 | 15 | hive.exec.scratchdir 16 | /tmp/hive-scratch 17 | Scratch space for Hive jobs 18 | 19 | 20 | hive.metastore.local 21 
| true 22 | 23 | 24 | javax.jdo.option.ConnectionURL 25 | jdbc:derby:;databaseName=metastore_db;create=true 26 | 27 | 28 | javax.jdo.option.ConnectionDriverName 29 | org.apache.derby.jdbc.EmbeddedDriver 30 | 31 | 32 | hive.metastore.metadb.dir 33 | file:///opt/metastore/metadb/ 34 | 35 | 36 | hive.metastore.uris 37 | file:///opt/metastore/metadb/ 38 | 39 | 40 | hive.metastore.warehouse.dir 41 | hdfs://__MASTER__:9000/user/hdfs/warehouse 42 | 43 | 44 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-base/files/shark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | . __SPARK_HOME__/conf/spark-env.sh 3 | export SHARK_MASTER_MEM=700m 4 | export HIVE_HOME=__HIVE_HOME__ 5 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # Shark master 2 | FROM shark-base:0.8.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Add run script 6 | ADD files /root/shark_master_files 7 | 8 | # Add default command for master 9 | CMD ["/root/shark_master_files/default_cmd"] 10 | 11 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-master/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-master:0.8.0 . 5 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # run this as: 4 | # sudo docker run -i -t -d shark-master:$SHARK_VERSION 5 | 6 | source /root/shark_files/configure_shark.sh 7 | 8 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 9 | echo "MASTER_IP=$IP" 10 | 11 | echo "preparing Shark" 12 | prepare_shark $IP 13 | 14 | echo "starting Hadoop namenode" 15 | sudo -u hdfs hadoop namenode -format > /dev/null 2>&1 16 | service hadoop-namenode start > /dev/null 2>&1 17 | 18 | echo "starting sshd" 19 | /usr/sbin/sshd 20 | 21 | sleep 5 22 | 23 | echo "starting Shark master" 24 | cp /root/shark_master_files/run_shark_master.sh / 25 | chmod a+rx /run_shark_master.sh 26 | sudo -u hdfs SPARK_VERSION=$SPARK_VERSION SHARK_VERSION=$SHARK_VERSION /run_shark_master.sh $IP 27 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-master/files/run_shark_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . 
/opt/shark-0.8.0/conf/shark-env.sh 3 | export PATH=$PATH:$SCALA_HOME/bin 4 | export CLASSPATH=$CLASSPATH:$SCALA_HOME/lib/scala-library.jar 5 | 6 | /opt/spark-0.8.0/bin/start-master.sh 7 | 8 | while [ 1 ]; 9 | do 10 | tail -f /opt/spark-${SPARK_VERSION}/logs/*.out 11 | sleep 1 12 | done 13 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Shark master 2 | FROM shark-base:0.8.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Add run script 6 | ADD files /root/shark_shell_files 7 | 8 | # Add default command for master 9 | ENTRYPOINT ["/root/shark_shell_files/default_cmd"] 10 | 11 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-shell/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-shell:0.8.0 . 5 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/shark_files/configure_shark.sh 4 | prepare_shark $1 5 | env 6 | 7 | # Note: there are issues if the nameserver did not have time to 8 | # refresh its cache with this shell's hostname so give him time 9 | # to do so. 10 | sleep 3 11 | 12 | cd /opt/metastore && sudo -u hdfs /opt/shark-$SHARK_VERSION/bin/shark 13 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-shell/files/test.shark: -------------------------------------------------------------------------------- 1 | CREATE TABLE src(key INT, value STRING); 2 | LOAD DATA LOCAL INPATH '${env:HIVE_HOME}/examples/files/kv1.txt' INTO TABLE src; 3 | SELECT COUNT(1) FROM src; 4 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Shark worker 2 | FROM shark-base:0.8.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Add run script 6 | ADD files /root/shark_worker_files 7 | 8 | # Add the entrypoint script for the worker 9 | ENTRYPOINT ["/root/shark_worker_files/default_cmd"] 10 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-worker/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}shark-worker:0.8.0 . 
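Every image directory's build script starts with the same preamble: it regenerates files/files.hash, a manifest of git blob hashes for every file in the build context, presumably so that the ADD files layer (and every layer after it) is rebuilt whenever any packaged file changes. A behaviour-equivalent variant that tolerates whitespace in filenames, offered only as a sketch rather than a drop-in replacement:

    # sketch of the same manifest step, null-delimited to survive odd filenames
    rm -f files/files.hash
    find . -type f -print0 | while IFS= read -r -d '' f; do
      printf '%s\t%s\n' "$(git hash-object "$f")" "${f#./}"
    done > /tmp/files.hash
    mv /tmp/files.hash files/files.hash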
5 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # run this as: 4 | # sudo docker run -d shark-worker:${SHARK_VERSION} ${MASTER_IP} spark://${MASTER_IP}:7077 5 | 6 | source /root/shark_files/configure_shark.sh 7 | 8 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 9 | echo "WORKER_IP=$IP" 10 | 11 | echo "preparing Shark" 12 | prepare_shark $1 13 | 14 | echo "starting Hadoop datanode" 15 | service hadoop-datanode start 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | sleep 5 21 | 22 | echo "starting Shark worker node" 23 | cp /root/shark_worker_files/run_shark_worker.sh / 24 | chmod a+rx /run_shark_worker.sh 25 | sudo -u hdfs SPARK_VERSION=$SPARK_VERSION SHARK_VERSION=$SHARK_VERSION /run_shark_worker.sh 26 | -------------------------------------------------------------------------------- /shark-0.8.0/shark-worker/files/run_shark_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/shark-0.8.0/conf/shark-env.sh 3 | export PATH=$PATH:$SCALA_HOME/bin 4 | export CLASSPATH=$CLASSPATH:$SCALA_HOME/lib/scala-library.jar 5 | /opt/spark-0.8.0/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077 6 | -------------------------------------------------------------------------------- /spark-0.7.3/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | spark_dirs=$(ls -d spark*) 4 | dir_list=("$spark_dirs") 5 | 6 | # NOTE: the order matters but this is the right one 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Hadoop 1.2.1 2 | # Version 1.2.1 3 | # 4 | FROM apache-hadoop-hdfs-precise:1.2.1 5 | 6 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 7 | 8 | ENV SCALA_VERSION 2.9.3 9 | ENV SPARK_VERSION 0.7.3 10 | ENV SCALA_HOME /opt/scala-$SCALA_VERSION 11 | ENV SPARK_HOME /opt/spark-$SPARK_VERSION 12 | ENV PATH $SPARK_HOME:$SCALA_HOME/bin:$PATH 13 | 14 | # Install Scala 15 | ADD http://www.scala-lang.org/files/archive/scala-$SCALA_VERSION.tgz / 16 | RUN (cd / && gunzip < scala-$SCALA_VERSION.tgz)|(cd /opt && tar -xvf -) 17 | RUN rm /scala-$SCALA_VERSION.tgz && chown -R hdfs.hdfs /opt/scala-$SCALA_VERSION 18 | 19 | # Install Spark 20 | ADD http://spark-project.org/download/spark-$SPARK_VERSION-prebuilt-hadoop1.tgz / 21 | RUN (cd / && gunzip < spark-$SPARK_VERSION-prebuilt-hadoop1.tgz)|(cd /opt && tar -xvf -) 22 | RUN rm /spark-$SPARK_VERSION-prebuilt-hadoop1.tgz 23 | 24 | # Add Spark config files and configure script 25 | ADD files /root/spark_files 26 | 27 | #RUN cp /root/spark_files/spark-0.7.3_precomp_hadoop1.tar.gz / 28 | #RUN (cd / && gunzip < spark-0.7.3_precomp_hadoop1.tar.gz)|(cd /opt && tar -xvf -) 29 | #RUN rm /spark-0.7.3_precomp_hadoop1.tar.gz 30 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-base/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . 
-type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-base:0.7.3 . 5 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-base/files/configure_spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/hadoop_files/configure_hadoop.sh 4 | 5 | function create_spark_directories() { 6 | create_hadoop_directories 7 | rm -rf /opt/spark-$SPARK_VERSION/work 8 | mkdir -p /opt/spark-$SPARK_VERSION/work 9 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/work 10 | mkdir /tmp/spark 11 | chown hdfs.hdfs /tmp/spark 12 | # this one is for Spark shell logging 13 | rm -rf /var/lib/hadoop/hdfs 14 | mkdir -p /var/lib/hadoop/hdfs 15 | chown hdfs.hdfs /var/lib/hadoop/hdfs 16 | } 17 | 18 | function deploy_spark_files() { 19 | deploy_hadoop_files 20 | cp /root/spark_files/spark-env.sh /opt/spark-$SPARK_VERSION/conf/ 21 | cp /root/spark_files/log4j.properties /opt/spark-$SPARK_VERSION/conf/ 22 | } 23 | 24 | function configure_spark() { 25 | configure_hadoop $1 26 | #sed -i s/__MASTER__/$1/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 27 | #sed -i s/__MASTER__/master/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 28 | sed -i s/__SPARK_HOME__/"\/opt\/spark-${SPARK_VERSION}"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 29 | sed -i s/__JAVA_HOME__/"\/usr\/lib\/jvm\/java-7-openjdk-amd64"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 30 | } 31 | 32 | function prepare_spark() { 33 | create_spark_directories 34 | deploy_spark_files 35 | configure_spark $1 36 | } 37 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-base/files/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 6 | 7 | # Ignore messages below warning level from Jetty, because it's a bit verbose 8 | log4j.logger.org.eclipse.jetty=WARN 9 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-base/files/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export SCALA_HOME=/opt/scala-2.9.3 3 | export SPARK_HOME=__SPARK_HOME__ 4 | export SPARK_WORKER_CORES=1 5 | export SPARK_MEM=800m 6 | export SPARK_WORKER_MEMORY=1500m 7 | export SPARK_MASTER_MEM=1500m 8 | export SPARK_WORKER_CORES=1 9 | export HADOOP_HOME="/etc/hadoop" 10 | export MASTER="spark://master:7077" 11 | export SPARK_LOCAL_DIR=/tmp/spark 12 | #SPARK_JAVA_OPTS="-Dspark.local.dir=/tmp/spark " 13 | #SPARK_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 14 | #SPARK_JAVA_OPTS+="-Dspark.kryoserializer.buffer.mb=10 " 15 | #SPARK_JAVA_OPTS+="-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " 16 | #export SPARK_JAVA_OPTS 17 | #SPARK_DAEMON_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 18 | #export SPARK_DAEMON_JAVA_OPTS 19 | export JAVA_HOME=__JAVA_HOME__ 20 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-master/Dockerfile: 
-------------------------------------------------------------------------------- 1 | # Spark 0.7.3 2 | # Version 0.7.3 3 | FROM spark-base:0.7.3 4 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 5 | 6 | ADD files /root/spark_master_files 7 | 8 | CMD ["/root/spark_master_files/default_cmd"] 9 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-master/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-master:0.7.3 . 5 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env 4 | 5 | source /root/spark_files/configure_spark.sh 6 | 7 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 8 | echo "MASTER_IP=$IP" 9 | 10 | echo "preparing Spark" 11 | prepare_spark $IP 12 | 13 | echo "starting Hadoop Namenode" 14 | sudo -u hdfs hadoop namenode -format > /dev/null 2>&1 15 | service hadoop-namenode start > /dev/null 2>&1 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | # note: it seems important to sleep here 21 | sleep 5 22 | 23 | echo "starting Spark Master" 24 | cp /root/spark_master_files/run_spark_master.sh / 25 | chmod a+rx /run_spark_master.sh 26 | sudo -u hdfs /run_spark_master.sh 27 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-master/files/run_spark_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/spark-0.7.3/conf/spark-env.sh 3 | /opt/spark-0.7.3/run spark.deploy.master.Master -i master 4 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 0.7.3 2 | # Version 0.7.3 3 | FROM spark-base:0.7.3 4 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 5 | 6 | VOLUME [ "/etc/dnsmasq.d" ] 7 | 8 | ADD files /root/spark_shell_files 9 | 10 | # Add the entrypoint script for the master 11 | ENTRYPOINT ["/root/spark_shell_files/default_cmd"] 12 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-shell/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-shell:0.7.3 . 
5 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | env 6 | 7 | echo "preparing Spark" 8 | prepare_spark "master" 9 | 10 | echo "adding test data to HDFS" 11 | cp /root/spark_shell_files/test.txt /tmp 12 | sudo -u hdfs hadoop dfsadmin -safemode wait 13 | sudo -u hdfs hadoop fs -put /tmp/test.txt hdfs://master:9000/user/hdfs/test.txt 14 | 15 | # Note: there are issues if the nameserver did not have time to 16 | # refresh its cache with this shell's hostname so give him time 17 | # to do so. 18 | sleep 3 19 | 20 | echo "starting Spark Shell" 21 | cd $SPARK_HOME 22 | sudo -u hdfs HDFS_PREFIX=hdfs://master:9000 ./spark-shell 23 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-shell/files/test.spark: -------------------------------------------------------------------------------- 1 | val hdfs_prefix = System.getenv("HDFS_PREFIX") 2 | val textFile = sc.textFile(hdfs_prefix+"/user/hdfs/test.txt") 3 | textFile.count() 4 | textFile.map({line => line}).collect() 5 | exit 6 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-shell/files/test.txt: -------------------------------------------------------------------------------- 1 | this is a test 2 | more test 3 | one more line 4 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 0.7.3 2 | # Version 0.7.3 3 | FROM spark-base:0.7.3 4 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 5 | 6 | ADD files /root/spark_worker_files 7 | 8 | # Add the entrypoint script for the master 9 | CMD ["-h"] 10 | ENTRYPOINT ["/root/spark_worker_files/default_cmd"] 11 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-worker/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-worker:0.7.3 . 5 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 6 | echo "WORKER_IP=$IP" 7 | 8 | echo "preparing Spark" 9 | prepare_spark $1 10 | 11 | echo "starting Hadoop Datanode" 12 | service hadoop-datanode start 13 | 14 | echo "starting sshd" 15 | /usr/sbin/sshd 16 | 17 | sleep 5 18 | 19 | echo "starting Spark Worker" 20 | cp /root/spark_worker_files/run_spark_worker.sh / 21 | chmod a+rx /run_spark_worker.sh 22 | sudo -u hdfs /run_spark_worker.sh 23 | -------------------------------------------------------------------------------- /spark-0.7.3/spark-worker/files/run_spark_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . 
/opt/spark-0.7.3/conf/spark-env.sh 3 | /opt/spark-0.7.3/run spark.deploy.worker.Worker spark://master:7077 4 | -------------------------------------------------------------------------------- /spark-0.8.0/NOTE.txt: -------------------------------------------------------------------------------- 1 | Many of the files here are in fact identical to the ones in the 2 | Spark 0.7.3 directory. However, since Docker does not follow 3 | symbolic links when it builds images we need the duplication. 4 | -------------------------------------------------------------------------------- /spark-0.8.0/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | spark_dirs=$(ls -d spark*) 4 | dir_list=("$spark_dirs") 5 | 6 | # NOTE: the order matters but this is the right one 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 0.8.0 2 | # Version 0.8.0 3 | # 4 | FROM apache-hadoop-hdfs-precise:1.2.1 5 | 6 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 7 | 8 | ENV SCALA_VERSION 2.9.3 9 | ENV SPARK_VERSION 0.8.0 10 | ENV SCALA_HOME /opt/scala-$SCALA_VERSION 11 | ENV SPARK_HOME /opt/spark-$SPARK_VERSION 12 | ENV PATH $SPARK_HOME:$SCALA_HOME/bin:$PATH 13 | 14 | # Install Scala 15 | ADD http://www.scala-lang.org/files/archive/scala-$SCALA_VERSION.tgz / 16 | RUN (cd / && gunzip < scala-$SCALA_VERSION.tgz)|(cd /opt && tar -xvf -) 17 | RUN rm /scala-$SCALA_VERSION.tgz 18 | 19 | # Install Spark 20 | ADD http://d3kbcqa49mib13.cloudfront.net/spark-$SPARK_VERSION-incubating-bin-hadoop1.tgz / 21 | RUN (cd / && gunzip < spark-$SPARK_VERSION-incubating-bin-hadoop1.tgz)|(cd /opt && tar -xvf -) 22 | RUN (ln -s /opt/spark-$SPARK_VERSION-incubating-bin-hadoop1 /opt/spark-$SPARK_VERSION && rm /spark-$SPARK_VERSION-incubating-bin-hadoop1.tgz) 23 | 24 | # Add Shark config files and configure script 25 | ADD files /root/spark_files 26 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-base/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-base:0.8.0 . 
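As the NOTE above explains, Docker does not follow symbolic links in the build context, so each Spark/Shark version directory carries its own copies of the shared scripts and config templates. A hypothetical way to spot when those copies drift apart (not part of this repo); at the time of writing the 0.8.0 and 0.9.0 spark-base files should differ only in the Scala version inside spark-env.sh:

    # hypothetical drift check between duplicated spark-base build contexts
    diff -ru spark-0.8.0/spark-base/files spark-0.9.0/spark-base/files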
5 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-base/files/configure_spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/hadoop_files/configure_hadoop.sh 4 | 5 | function create_spark_directories() { 6 | create_hadoop_directories 7 | rm -rf /opt/spark-$SPARK_VERSION/work 8 | mkdir -p /opt/spark-$SPARK_VERSION/work 9 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/work 10 | mkdir /tmp/spark 11 | chown hdfs.hdfs /tmp/spark 12 | # this one is for Spark shell logging 13 | rm -rf /var/lib/hadoop/hdfs 14 | mkdir -p /var/lib/hadoop/hdfs 15 | chown hdfs.hdfs /var/lib/hadoop/hdfs 16 | rm -rf /opt/spark-$SPARK_VERSION/logs 17 | mkdir -p /opt/spark-$SPARK_VERSION/logs 18 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/logs 19 | } 20 | 21 | function deploy_spark_files() { 22 | deploy_hadoop_files 23 | cp /root/spark_files/spark-env.sh /opt/spark-$SPARK_VERSION/conf/ 24 | cp /root/spark_files/log4j.properties /opt/spark-$SPARK_VERSION/conf/ 25 | } 26 | 27 | function configure_spark() { 28 | configure_hadoop $1 29 | #sed -i s/__MASTER__/$1/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 30 | sed -i s/__MASTER__/master/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 31 | sed -i s/__SPARK_HOME__/"\/opt\/spark-${SPARK_VERSION}"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 32 | sed -i s/__JAVA_HOME__/"\/usr\/lib\/jvm\/java-7-openjdk-amd64"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 33 | } 34 | 35 | function prepare_spark() { 36 | create_spark_directories 37 | deploy_spark_files 38 | configure_spark $1 39 | } 40 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-base/files/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 6 | 7 | # Ignore messages below warning level from Jetty, because it's a bit verbose 8 | log4j.logger.org.eclipse.jetty=WARN 9 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-base/files/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export SCALA_HOME=/opt/scala-2.9.3 3 | export SPARK_HOME=__SPARK_HOME__ 4 | export SPARK_WORKER_CORES=1 5 | export SPARK_MEM=800m 6 | export SPARK_WORKER_MEMORY=1500m 7 | export SPARK_MASTER_MEM=1500m 8 | export SPARK_MASTER_IP=__MASTER__ 9 | export HADOOP_HOME="/etc/hadoop" 10 | export MASTER="spark://__MASTER__:7077" 11 | export SPARK_LOCAL_DIR=/tmp/spark 12 | #SPARK_JAVA_OPTS="-Dspark.local.dir=/tmp/spark " 13 | #SPARK_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 14 | #SPARK_JAVA_OPTS+="-Dspark.kryoserializer.buffer.mb=10 " 15 | #SPARK_JAVA_OPTS+="-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " 16 | #export SPARK_JAVA_OPTS 17 | #SPARK_DAEMON_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 18 | #export SPARK_DAEMON_JAVA_OPTS 19 | export JAVA_HOME=__JAVA_HOME__ 20 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.8.0 3 | 
MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Expose TCP ports 7077 8080 6 | EXPOSE 7077 8080 7 | 8 | ADD files /root/spark_master_files 9 | 10 | CMD ["/root/spark_master_files/default_cmd"] 11 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-master/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-master:0.8.0 . 5 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env 4 | 5 | source /root/spark_files/configure_spark.sh 6 | 7 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 8 | echo "MASTER_IP=$IP" 9 | 10 | echo "preparing Spark" 11 | prepare_spark $IP 12 | 13 | echo "starting Hadoop Namenode" 14 | sudo -u hdfs hadoop namenode -format > /dev/null 2>&1 15 | service hadoop-namenode start > /dev/null 2>&1 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | sleep 5 21 | 22 | echo "starting Spark Master" 23 | cp /root/spark_master_files/run_spark_master.sh / 24 | chmod a+rx /run_spark_master.sh 25 | sudo -u hdfs SPARK_VERSION=$SPARK_VERSION /run_spark_master.sh 26 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-master/files/run_spark_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | /opt/spark-0.8.0/bin/start-master.sh 3 | 4 | while [ 1 ]; 5 | do 6 | tail -f /opt/spark-${SPARK_VERSION}/logs/*.out 7 | sleep 1 8 | done 9 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.8.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_shell_files 10 | 11 | # Add the entrypoint script for the master 12 | ENTRYPOINT ["/root/spark_shell_files/default_cmd"] 13 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-shell/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-shell:0.8.0 . 
5 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | env 6 | 7 | echo "preparing Spark" 8 | prepare_spark "master" 9 | 10 | echo "adding test data to HDFS" 11 | cp /root/spark_shell_files/test.txt /tmp 12 | sudo -u hdfs hadoop dfsadmin -safemode wait 13 | sudo -u hdfs hadoop fs -put /tmp/test.txt hdfs://master:9000/user/hdfs/test.txt 14 | 15 | cp /root/spark_shell_files/test.spark / 16 | 17 | # Note: there are issues if the nameserver did not have time to 18 | # refresh its cache with this shell's hostname so give him time 19 | # to do so. 20 | sleep 3 21 | 22 | echo "starting Spark Shell" 23 | 24 | cd $SPARK_HOME 25 | sudo -u hdfs MASTER=spark://master:7077 HDFS_PREFIX=hdfs://master:9000 ./spark-shell 26 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-shell/files/test.spark: -------------------------------------------------------------------------------- 1 | val hdfs_prefix = System.getenv("HDFS_PREFIX") 2 | val textFile = sc.textFile(hdfs_prefix+"/user/hdfs/test.txt") 3 | textFile.count() 4 | textFile.map({line => line}).collect() 5 | exit 6 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-shell/files/test.txt: -------------------------------------------------------------------------------- 1 | this is a test 2 | more test 3 | one more line 4 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.8.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_worker_files 10 | 11 | # Add the entrypoint script for the master 12 | CMD ["-h"] 13 | ENTRYPOINT ["/root/spark_worker_files/default_cmd"] 14 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-worker/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-worker:0.8.0 . 
5 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 6 | echo "WORKER_IP=$IP" 7 | 8 | echo "preparing Spark" 9 | prepare_spark $1 10 | 11 | echo "starting Hadoop Datanode" 12 | service hadoop-datanode start 13 | 14 | echo "starting sshd" 15 | /usr/sbin/sshd 16 | 17 | sleep 5 18 | 19 | echo "starting Spark Worker" 20 | cp /root/spark_worker_files/run_spark_worker.sh / 21 | chmod a+rx /run_spark_worker.sh 22 | sudo -u hdfs /run_spark_worker.sh 23 | -------------------------------------------------------------------------------- /spark-0.8.0/spark-worker/files/run_spark_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/spark-0.8.0/conf/spark-env.sh 3 | ${SPARK_HOME}/spark-class org.apache.spark.deploy.worker.Worker $MASTER 4 | -------------------------------------------------------------------------------- /spark-0.9.0/NOTE.txt: -------------------------------------------------------------------------------- 1 | Many of the files here are in fact identical to the ones in the 2 | Spark 0.8.0 directory. However, since Docker does not follow 3 | symbolic links when it builds images we need the duplication. 4 | -------------------------------------------------------------------------------- /spark-0.9.0/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | spark_dirs=$(ls -d spark*) 4 | dir_list=("$spark_dirs") 5 | 6 | # NOTE: the order matters but this is the right one 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 0.9.0 2 | # Version 0.9.0 3 | # 4 | FROM apache-hadoop-hdfs-precise:1.2.1 5 | 6 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 7 | 8 | ENV SCALA_VERSION 2.10.3 9 | ENV SPARK_VERSION 0.9.0 10 | ENV SCALA_HOME /opt/scala-$SCALA_VERSION 11 | ENV SPARK_HOME /opt/spark-$SPARK_VERSION 12 | ENV PATH $SPARK_HOME:$SCALA_HOME/bin:$PATH 13 | 14 | # Install Scala 15 | ADD http://www.scala-lang.org/files/archive/scala-$SCALA_VERSION.tgz / 16 | RUN (cd / && gunzip < scala-$SCALA_VERSION.tgz)|(cd /opt && tar -xvf -) 17 | RUN rm /scala-$SCALA_VERSION.tgz 18 | 19 | # Install Spark 20 | ADD http://d3kbcqa49mib13.cloudfront.net/spark-$SPARK_VERSION-incubating-bin-hadoop1.tgz / 21 | RUN (cd / && gunzip < spark-$SPARK_VERSION-incubating-bin-hadoop1.tgz)|(cd /opt && tar -xvf -) 22 | RUN (ln -s /opt/spark-$SPARK_VERSION-incubating-bin-hadoop1 /opt/spark-$SPARK_VERSION && rm /spark-$SPARK_VERSION-incubating-bin-hadoop1.tgz) 23 | 24 | # Add Spark config files and configure script 25 | ADD files /root/spark_files 26 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-base/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find .
-type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-base:0.9.0 . 5 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-base/files/configure_spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/hadoop_files/configure_hadoop.sh 4 | 5 | function create_spark_directories() { 6 | create_hadoop_directories 7 | rm -rf /opt/spark-$SPARK_VERSION/work 8 | mkdir -p /opt/spark-$SPARK_VERSION/work 9 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/work 10 | mkdir /tmp/spark 11 | chown hdfs.hdfs /tmp/spark 12 | # this one is for Spark shell logging 13 | rm -rf /var/lib/hadoop/hdfs 14 | mkdir -p /var/lib/hadoop/hdfs 15 | chown hdfs.hdfs /var/lib/hadoop/hdfs 16 | rm -rf /opt/spark-$SPARK_VERSION/logs 17 | mkdir -p /opt/spark-$SPARK_VERSION/logs 18 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/logs 19 | } 20 | 21 | function deploy_spark_files() { 22 | deploy_hadoop_files 23 | cp /root/spark_files/spark-env.sh /opt/spark-$SPARK_VERSION/conf/ 24 | cp /root/spark_files/log4j.properties /opt/spark-$SPARK_VERSION/conf/ 25 | } 26 | 27 | function configure_spark() { 28 | configure_hadoop $1 29 | #sed -i s/__MASTER__/$1/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 30 | sed -i s/__MASTER__/master/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 31 | sed -i s/__SPARK_HOME__/"\/opt\/spark-${SPARK_VERSION}"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 32 | sed -i s/__JAVA_HOME__/"\/usr\/lib\/jvm\/java-7-openjdk-amd64"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 33 | } 34 | 35 | function prepare_spark() { 36 | create_spark_directories 37 | deploy_spark_files 38 | configure_spark $1 39 | } 40 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-base/files/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 6 | 7 | # Ignore messages below warning level from Jetty, because it's a bit verbose 8 | log4j.logger.org.eclipse.jetty=WARN 9 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-base/files/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export SCALA_HOME=/opt/scala-2.10.3 3 | export SPARK_HOME=__SPARK_HOME__ 4 | export SPARK_WORKER_CORES=1 5 | export SPARK_MEM=800m 6 | export SPARK_WORKER_MEMORY=1500m 7 | export SPARK_MASTER_MEM=1500m 8 | export SPARK_MASTER_IP=__MASTER__ 9 | export HADOOP_HOME="/etc/hadoop" 10 | export MASTER="spark://__MASTER__:7077" 11 | export SPARK_LOCAL_DIR=/tmp/spark 12 | #SPARK_JAVA_OPTS="-Dspark.local.dir=/tmp/spark " 13 | #SPARK_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 14 | #SPARK_JAVA_OPTS+="-Dspark.kryoserializer.buffer.mb=10 " 15 | #SPARK_JAVA_OPTS+="-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " 16 | #export SPARK_JAVA_OPTS 17 | #SPARK_DAEMON_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 18 | #export SPARK_DAEMON_JAVA_OPTS 19 | export JAVA_HOME=__JAVA_HOME__ 20 | 
-------------------------------------------------------------------------------- /spark-0.9.0/spark-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.9.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Expose TCP ports 7077 8080 6 | EXPOSE 7077 8080 7 | 8 | ADD files /root/spark_master_files 9 | 10 | CMD ["/root/spark_master_files/default_cmd"] 11 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-master/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-master:0.9.0 . 5 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env 4 | 5 | source /root/spark_files/configure_spark.sh 6 | 7 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 8 | echo "MASTER_IP=$IP" 9 | 10 | echo "preparing Spark" 11 | prepare_spark $IP 12 | 13 | echo "starting Hadoop Namenode" 14 | sudo -u hdfs hadoop namenode -format > /dev/null 2>&1 15 | service hadoop-namenode start > /dev/null 2>&1 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | sleep 5 21 | 22 | echo "starting Spark Master" 23 | cp /root/spark_master_files/run_spark_master.sh / 24 | chmod a+rx /run_spark_master.sh 25 | sudo -u hdfs SPARK_VERSION=$SPARK_VERSION /run_spark_master.sh 26 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-master/files/run_spark_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | /opt/spark-0.9.0/sbin/start-master.sh 3 | 4 | while [ 1 ]; 5 | do 6 | tail -f /opt/spark-${SPARK_VERSION}/logs/*.out 7 | sleep 1 8 | done 9 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.9.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_shell_files 10 | 11 | # Add the entrypoint script for the master 12 | ENTRYPOINT ["/root/spark_shell_files/default_cmd"] 13 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-shell/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-shell:0.9.0 . 
5 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | env 6 | 7 | echo "preparing Spark" 8 | prepare_spark "master" 9 | 10 | echo "adding test data to HDFS" 11 | cp /root/spark_shell_files/test.txt /tmp 12 | sudo -u hdfs hadoop dfsadmin -safemode wait 13 | sudo -u hdfs hadoop fs -put /tmp/test.txt hdfs://master:9000/user/hdfs/test.txt 14 | 15 | cp /root/spark_shell_files/test.spark / 16 | 17 | # Note: there are issues if the nameserver did not have time to 18 | # refresh its cache with this shell's hostname so give him time 19 | # to do so. 20 | sleep 3 21 | 22 | echo "starting Spark Shell" 23 | 24 | cd $SPARK_HOME 25 | sudo -u hdfs MASTER=spark://master:7077 HDFS_PREFIX=hdfs://master:9000 ./bin/spark-shell 26 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-shell/files/test.spark: -------------------------------------------------------------------------------- 1 | val hdfs_prefix = System.getenv("HDFS_PREFIX") 2 | val textFile = sc.textFile(hdfs_prefix+"/user/hdfs/test.txt") 3 | textFile.count() 4 | textFile.map({line => line}).collect() 5 | exit 6 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-shell/files/test.txt: -------------------------------------------------------------------------------- 1 | this is a test 2 | more test 3 | one more line 4 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.9.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_worker_files 10 | 11 | # Add the entrypoint script for the master 12 | CMD ["-h"] 13 | ENTRYPOINT ["/root/spark_worker_files/default_cmd"] 14 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-worker/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-worker:0.9.0 . 
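
NOTE: each of the build scripts above snapshots its inputs into files/files.hash (one git blob hash per file) before running docker build. A small sketch, assuming it is run from inside one of the image directories (for example spark-0.9.0/spark-worker), to recompute that manifest and see whether the image inputs have drifted since the last build:

    #!/bin/bash
    # Recompute the per-file git blob hashes and diff them against files/files.hash.
    # files/files.hash itself is skipped because the build scripts delete it before hashing.
    for i in $(find . -type f | sed 's/\.\///' | grep -v 'files/files.hash'); do
      printf '%s\t%s\n' "$(git hash-object $i)" "$i"
    done > /tmp/files.hash.new
    diff /tmp/files.hash.new files/files.hash \
      && echo "image inputs unchanged" \
      || echo "image inputs differ from the recorded files.hash"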
5 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 6 | echo "WORKER_IP=$IP" 7 | 8 | echo "preparing Spark" 9 | prepare_spark $1 10 | 11 | echo "starting Hadoop Datanode" 12 | service hadoop-datanode start 13 | 14 | echo "starting sshd" 15 | /usr/sbin/sshd 16 | 17 | sleep 5 18 | 19 | echo "starting Spark Worker" 20 | cp /root/spark_worker_files/run_spark_worker.sh / 21 | chmod a+rx /run_spark_worker.sh 22 | sudo -u hdfs /run_spark_worker.sh 23 | -------------------------------------------------------------------------------- /spark-0.9.0/spark-worker/files/run_spark_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/spark-0.9.0/conf/spark-env.sh 3 | ${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.worker.Worker $MASTER 4 | -------------------------------------------------------------------------------- /spark-0.9.1/NOTE.txt: -------------------------------------------------------------------------------- 1 | Many of the files here are in fact identical to the ones in the 2 | other Spark version directories. However, since Docker does not follow 3 | symbolic links when it builds images, the duplication is necessary. 4 | -------------------------------------------------------------------------------- /spark-0.9.1/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | spark_dirs=$(ls -d spark*) 4 | dir_list=("$spark_dirs") 5 | 6 | # NOTE: the build order matters; spark-base must be built before the other images, and this listing yields that order 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 0.9.1 2 | # Version 0.9.1 3 | # 4 | FROM apache-hadoop-hdfs-precise:1.2.1 5 | 6 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 7 | 8 | ENV SCALA_VERSION 2.10.3 9 | ENV SPARK_VERSION 0.9.1 10 | ENV SCALA_HOME /opt/scala-$SCALA_VERSION 11 | ENV SPARK_HOME /opt/spark-$SPARK_VERSION 12 | ENV PATH $SPARK_HOME:$SCALA_HOME/bin:$PATH 13 | 14 | # Install Scala 15 | ADD http://www.scala-lang.org/files/archive/scala-$SCALA_VERSION.tgz / 16 | RUN (cd / && gunzip < scala-$SCALA_VERSION.tgz)|(cd /opt && tar -xvf -) 17 | RUN rm /scala-$SCALA_VERSION.tgz 18 | 19 | # Install Spark 20 | ADD http://d3kbcqa49mib13.cloudfront.net/spark-$SPARK_VERSION-bin-hadoop1.tgz / 21 | RUN (cd / && gunzip < spark-$SPARK_VERSION-bin-hadoop1.tgz)|(cd /opt && tar -xvf -) 22 | RUN (ln -s /opt/spark-$SPARK_VERSION-bin-hadoop1 /opt/spark-$SPARK_VERSION && rm /spark-$SPARK_VERSION-bin-hadoop1.tgz) 23 | 24 | # Add Spark config files and configure script 25 | ADD files /root/spark_files 26 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-base/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . 
-type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-base:0.9.1 . 5 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-base/files/configure_spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/hadoop_files/configure_hadoop.sh 4 | 5 | function create_spark_directories() { 6 | create_hadoop_directories 7 | rm -rf /opt/spark-$SPARK_VERSION/work 8 | mkdir -p /opt/spark-$SPARK_VERSION/work 9 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/work 10 | mkdir /tmp/spark 11 | chown hdfs.hdfs /tmp/spark 12 | # this one is for Spark shell logging 13 | rm -rf /var/lib/hadoop/hdfs 14 | mkdir -p /var/lib/hadoop/hdfs 15 | chown hdfs.hdfs /var/lib/hadoop/hdfs 16 | rm -rf /opt/spark-$SPARK_VERSION/logs 17 | mkdir -p /opt/spark-$SPARK_VERSION/logs 18 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/logs 19 | } 20 | 21 | function deploy_spark_files() { 22 | deploy_hadoop_files 23 | cp /root/spark_files/spark-env.sh /opt/spark-$SPARK_VERSION/conf/ 24 | cp /root/spark_files/log4j.properties /opt/spark-$SPARK_VERSION/conf/ 25 | } 26 | 27 | function configure_spark() { 28 | configure_hadoop $1 29 | #sed -i s/__MASTER__/$1/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 30 | sed -i s/__MASTER__/master/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 31 | sed -i s/__SPARK_HOME__/"\/opt\/spark-${SPARK_VERSION}"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 32 | sed -i s/__JAVA_HOME__/"\/usr\/lib\/jvm\/java-7-openjdk-amd64"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 33 | } 34 | 35 | function prepare_spark() { 36 | create_spark_directories 37 | deploy_spark_files 38 | configure_spark $1 39 | } 40 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-base/files/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 6 | 7 | # Ignore messages below warning level from Jetty, because it's a bit verbose 8 | log4j.logger.org.eclipse.jetty=WARN 9 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-base/files/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export SCALA_HOME=/opt/scala-2.10.3 3 | export SPARK_HOME=__SPARK_HOME__ 4 | export SPARK_WORKER_CORES=1 5 | export SPARK_MEM=800m 6 | export SPARK_WORKER_MEMORY=1500m 7 | export SPARK_MASTER_MEM=1500m 8 | export SPARK_MASTER_IP=__MASTER__ 9 | export HADOOP_HOME="/etc/hadoop" 10 | export MASTER="spark://__MASTER__:7077" 11 | export SPARK_LOCAL_DIR=/tmp/spark 12 | #SPARK_JAVA_OPTS="-Dspark.local.dir=/tmp/spark " 13 | #SPARK_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 14 | #SPARK_JAVA_OPTS+="-Dspark.kryoserializer.buffer.mb=10 " 15 | #SPARK_JAVA_OPTS+="-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " 16 | #export SPARK_JAVA_OPTS 17 | #SPARK_DAEMON_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 18 | #export SPARK_DAEMON_JAVA_OPTS 19 | export JAVA_HOME=__JAVA_HOME__ 20 | 
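
NOTE: the substituted spark-env.sh above is what both the worker entrypoint and run_spark_worker.sh source, so the worker ends up connecting to spark://master:7077. A quick reachability sketch, meant to be run from inside an already-running worker or shell container, assuming the dnsmasq nameserver started by the deploy scripts resolves the hostname "master":

    #!/bin/bash
    # Confirm the standalone master port is reachable before the worker registers.
    . /opt/spark-0.9.1/conf/spark-env.sh
    echo "configured master URL: $MASTER"
    if (exec 3<>"/dev/tcp/${SPARK_MASTER_IP}/7077") 2>/dev/null; then
      echo "${SPARK_MASTER_IP}:7077 is reachable"
    else
      echo "cannot reach ${SPARK_MASTER_IP}:7077 -- is the spark-master container up?"
    fi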
-------------------------------------------------------------------------------- /spark-0.9.1/spark-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.9.1 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Expose TCP ports 7077 8080 6 | EXPOSE 7077 8080 7 | 8 | ADD files /root/spark_master_files 9 | 10 | CMD ["/root/spark_master_files/default_cmd"] 11 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-master/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-master:0.9.1 . 5 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env 4 | 5 | source /root/spark_files/configure_spark.sh 6 | 7 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 8 | echo "MASTER_IP=$IP" 9 | 10 | echo "preparing Spark" 11 | prepare_spark $IP 12 | 13 | echo "starting Hadoop Namenode" 14 | sudo -u hdfs hadoop namenode -format > /dev/null 2>&1 15 | service hadoop-namenode start > /dev/null 2>&1 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | sleep 5 21 | 22 | echo "starting Spark Master" 23 | cp /root/spark_master_files/run_spark_master.sh / 24 | chmod a+rx /run_spark_master.sh 25 | sudo -u hdfs SPARK_VERSION=$SPARK_VERSION /run_spark_master.sh 26 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-master/files/run_spark_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | /opt/spark-0.9.1/sbin/start-master.sh 3 | 4 | while [ 1 ]; 5 | do 6 | tail -f /opt/spark-${SPARK_VERSION}/logs/*.out 7 | sleep 1 8 | done 9 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.9.1 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_shell_files 10 | 11 | # Add the entrypoint script for the master 12 | ENTRYPOINT ["/root/spark_shell_files/default_cmd"] 13 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-shell/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-shell:0.9.1 . 
5 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | env 6 | 7 | echo "preparing Spark" 8 | prepare_spark "master" 9 | 10 | echo "adding test data to HDFS" 11 | cp /root/spark_shell_files/test.txt /tmp 12 | sudo -u hdfs hadoop dfsadmin -safemode wait 13 | sudo -u hdfs hadoop fs -put /tmp/test.txt hdfs://master:9000/user/hdfs/test.txt 14 | 15 | cp /root/spark_shell_files/test.spark / 16 | 17 | # Note: there are issues if the nameserver did not have time to 18 | # refresh its cache with this shell's hostname so give him time 19 | # to do so. 20 | sleep 3 21 | 22 | echo "starting Spark Shell" 23 | 24 | cd $SPARK_HOME 25 | sudo -u hdfs MASTER=spark://master:7077 HDFS_PREFIX=hdfs://master:9000 ./bin/spark-shell 26 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-shell/files/test.spark: -------------------------------------------------------------------------------- 1 | val hdfs_prefix = System.getenv("HDFS_PREFIX") 2 | val textFile = sc.textFile(hdfs_prefix+"/user/hdfs/test.txt") 3 | textFile.count() 4 | textFile.map({line => line}).collect() 5 | exit 6 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-shell/files/test.txt: -------------------------------------------------------------------------------- 1 | this is a test 2 | more test 3 | one more line 4 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:0.9.1 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_worker_files 10 | 11 | # Add the entrypoint script for the master 12 | CMD ["-h"] 13 | ENTRYPOINT ["/root/spark_worker_files/default_cmd"] 14 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-worker/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-worker:0.9.1 . 
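
NOTE: once run_spark_worker.sh has started the Worker, the master's web UI (port 8080, exposed by the spark-master image) lists registered workers with IDs of the form worker-<timestamp>-<host>-<port>. A rough check scripted against that page, assuming curl is available wherever the check runs; check_workers.sh and the default address are hypothetical, pass the address printed by the master container instead:

    #!/bin/bash
    # Usage: ./check_workers.sh [master address]
    MASTER_IP=${1:-master}
    if curl -s "http://${MASTER_IP}:8080" | grep -q 'worker-'; then
      echo "at least one worker is registered with the master"
    else
      echo "no workers registered yet"
    fi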
5 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 6 | echo "WORKER_IP=$IP" 7 | 8 | echo "preparing Spark" 9 | prepare_spark $1 10 | 11 | echo "starting Hadoop Datanode" 12 | service hadoop-datanode start 13 | 14 | echo "starting sshd" 15 | /usr/sbin/sshd 16 | 17 | sleep 5 18 | 19 | echo "starting Spark Worker" 20 | cp /root/spark_worker_files/run_spark_worker.sh / 21 | chmod a+rx /run_spark_worker.sh 22 | sudo -u hdfs /run_spark_worker.sh 23 | -------------------------------------------------------------------------------- /spark-0.9.1/spark-worker/files/run_spark_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/spark-0.9.1/conf/spark-env.sh 3 | ${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.worker.Worker $MASTER 4 | -------------------------------------------------------------------------------- /spark-1.0.0/NOTE.txt: -------------------------------------------------------------------------------- 1 | Many of the files here are in fact identical to the ones in the 2 | other Spark version directories. However, since Docker does not follow 3 | symbolic links when it builds images, the duplication is necessary. 4 | -------------------------------------------------------------------------------- /spark-1.0.0/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | spark_dirs=$(ls -d spark*) 4 | dir_list=("$spark_dirs") 5 | 6 | # NOTE: the build order matters; spark-base must be built before the other images, and this listing yields that order 7 | for i in ${dir_list[@]}; do 8 | echo building $i; 9 | cd $i; 10 | cat build; 11 | . build; 12 | cd ..; 13 | done 14 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 1.0.0 2 | # Version 1.0.0 3 | # 4 | FROM apache-hadoop-hdfs-precise:1.2.1 5 | 6 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 7 | 8 | ENV SCALA_VERSION 2.10.3 9 | ENV SPARK_VERSION 1.0.0 10 | ENV SCALA_HOME /opt/scala-$SCALA_VERSION 11 | ENV SPARK_HOME /opt/spark-$SPARK_VERSION 12 | ENV PATH $SPARK_HOME:$SCALA_HOME/bin:$PATH 13 | 14 | # Install Scala 15 | ADD http://www.scala-lang.org/files/archive/scala-$SCALA_VERSION.tgz / 16 | RUN (cd / && gunzip < scala-$SCALA_VERSION.tgz)|(cd /opt && tar -xvf -) 17 | RUN rm /scala-$SCALA_VERSION.tgz 18 | 19 | # Install Spark 20 | ADD http://d3kbcqa49mib13.cloudfront.net/spark-$SPARK_VERSION-bin-hadoop1.tgz / 21 | RUN (cd / && gunzip < spark-$SPARK_VERSION-bin-hadoop1.tgz)|(cd /opt && tar -xvf -) 22 | RUN (ln -s /opt/spark-$SPARK_VERSION-bin-hadoop1 /opt/spark-$SPARK_VERSION && rm /spark-$SPARK_VERSION-bin-hadoop1.tgz) 23 | 24 | # Add Spark config files and configure script 25 | ADD files /root/spark_files 26 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-base/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . 
-type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-base:1.0.0 . 5 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-base/files/configure_spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/hadoop_files/configure_hadoop.sh 4 | 5 | function create_spark_directories() { 6 | create_hadoop_directories 7 | rm -rf /opt/spark-$SPARK_VERSION/work 8 | mkdir -p /opt/spark-$SPARK_VERSION/work 9 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/work 10 | mkdir /tmp/spark 11 | chown hdfs.hdfs /tmp/spark 12 | # this one is for Spark shell logging 13 | rm -rf /var/lib/hadoop/hdfs 14 | mkdir -p /var/lib/hadoop/hdfs 15 | chown hdfs.hdfs /var/lib/hadoop/hdfs 16 | rm -rf /opt/spark-$SPARK_VERSION/logs 17 | mkdir -p /opt/spark-$SPARK_VERSION/logs 18 | chown hdfs.hdfs /opt/spark-$SPARK_VERSION/logs 19 | } 20 | 21 | function deploy_spark_files() { 22 | deploy_hadoop_files 23 | cp /root/spark_files/spark-env.sh /opt/spark-$SPARK_VERSION/conf/ 24 | cp /root/spark_files/log4j.properties /opt/spark-$SPARK_VERSION/conf/ 25 | } 26 | 27 | function configure_spark() { 28 | configure_hadoop $1 29 | #sed -i s/__MASTER__/$1/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 30 | sed -i s/__MASTER__/master/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 31 | sed -i s/__SPARK_HOME__/"\/opt\/spark-${SPARK_VERSION}"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 32 | sed -i s/__JAVA_HOME__/"\/usr\/lib\/jvm\/java-7-openjdk-amd64"/ /opt/spark-$SPARK_VERSION/conf/spark-env.sh 33 | } 34 | 35 | function prepare_spark() { 36 | create_spark_directories 37 | deploy_spark_files 38 | configure_spark $1 39 | } 40 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-base/files/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 6 | 7 | # Ignore messages below warning level from Jetty, because it's a bit verbose 8 | log4j.logger.org.eclipse.jetty=WARN 9 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-base/files/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export SCALA_HOME=/opt/scala-2.10.3 3 | export SPARK_HOME=__SPARK_HOME__ 4 | export SPARK_WORKER_CORES=1 5 | export SPARK_MEM=800m 6 | export SPARK_WORKER_MEMORY=1500m 7 | export SPARK_MASTER_MEM=1500m 8 | export SPARK_MASTER_IP=__MASTER__ 9 | export HADOOP_HOME="/etc/hadoop" 10 | export MASTER="spark://__MASTER__:7077" 11 | export SPARK_LOCAL_DIR=/tmp/spark 12 | #SPARK_JAVA_OPTS="-Dspark.local.dir=/tmp/spark " 13 | #SPARK_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 14 | #SPARK_JAVA_OPTS+="-Dspark.kryoserializer.buffer.mb=10 " 15 | #SPARK_JAVA_OPTS+="-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " 16 | #export SPARK_JAVA_OPTS 17 | #SPARK_DAEMON_JAVA_OPTS+=" -Dspark.akka.logLifecycleEvents=true " 18 | #export SPARK_DAEMON_JAVA_OPTS 19 | export JAVA_HOME=__JAVA_HOME__ 20 | 
-------------------------------------------------------------------------------- /spark-1.0.0/spark-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:1.0.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Expose TCP ports 7077 8080 6 | EXPOSE 7077 8080 7 | 8 | ADD files /root/spark_master_files 9 | 10 | CMD ["/root/spark_master_files/default_cmd"] 11 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-master/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-master:1.0.0 . 5 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-master/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | env 4 | 5 | source /root/spark_files/configure_spark.sh 6 | 7 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 8 | echo "MASTER_IP=$IP" 9 | 10 | echo "preparing Spark" 11 | prepare_spark $IP 12 | 13 | echo "starting Hadoop Namenode" 14 | sudo -u hdfs hadoop namenode -format > /dev/null 2>&1 15 | service hadoop-namenode start > /dev/null 2>&1 16 | 17 | echo "starting sshd" 18 | /usr/sbin/sshd 19 | 20 | sleep 5 21 | 22 | echo "starting Spark Master" 23 | cp /root/spark_master_files/run_spark_master.sh / 24 | chmod a+rx /run_spark_master.sh 25 | sudo -u hdfs SPARK_VERSION=$SPARK_VERSION /run_spark_master.sh 26 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-master/files/run_spark_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | /opt/spark-1.0.0/sbin/start-master.sh 3 | 4 | while [ 1 ]; 5 | do 6 | tail -f /opt/spark-${SPARK_VERSION}/logs/*.out 7 | sleep 1 8 | done 9 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-shell/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:1.0.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_shell_files 10 | 11 | # Add the entrypoint script for the master 12 | ENTRYPOINT ["/root/spark_shell_files/default_cmd"] 13 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-shell/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-shell:1.0.0 . 
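
NOTE: the spark-master entrypoint above formats HDFS, starts the namenode and sshd, and then hands off to run_spark_master.sh, which tails the master's *.out logs to stdout forever. The container log is therefore the quickest place to confirm the master came up. A sketch, assuming docker is driven from the host and the container id is the one reported by the deploy scripts; check_master.sh is a hypothetical helper, not part of this repository:

    #!/bin/bash
    # Usage: ./check_master.sh <spark-master container id>
    CONTAINER="$1"
    # default_cmd echoes MASTER_IP=..., and the tailed master log should contain a
    # "Starting Spark master at spark://..." style line once the master is up.
    sudo docker logs "$CONTAINER" 2>&1 | grep -E 'MASTER_IP=|Starting Spark master'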
5 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-shell/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | env 6 | 7 | echo "preparing Spark" 8 | prepare_spark "master" 9 | 10 | echo "adding test data to HDFS" 11 | cp /root/spark_shell_files/test.txt /tmp 12 | sudo -u hdfs hadoop dfsadmin -safemode wait 13 | sudo -u hdfs hadoop fs -put /tmp/test.txt hdfs://master:9000/user/hdfs/test.txt 14 | 15 | cp /root/spark_shell_files/test.spark / 16 | 17 | # Note: there are issues if the nameserver did not have time to 18 | # refresh its cache with this shell's hostname so give him time 19 | # to do so. 20 | sleep 3 21 | 22 | echo "starting Spark Shell" 23 | 24 | cd $SPARK_HOME 25 | sudo -u hdfs MASTER=spark://master:7077 HDFS_PREFIX=hdfs://master:9000 ./bin/spark-shell 26 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-shell/files/test.spark: -------------------------------------------------------------------------------- 1 | val hdfs_prefix = System.getenv("HDFS_PREFIX") 2 | val textFile = sc.textFile(hdfs_prefix+"/user/hdfs/test.txt") 3 | textFile.count() 4 | textFile.map({line => line}).collect() 5 | exit 6 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-shell/files/test.txt: -------------------------------------------------------------------------------- 1 | this is a test 2 | more test 3 | one more line 4 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Spark 2 | FROM spark-base:1.0.0 3 | MAINTAINER amplab amp-docker@eecs.berkeley.edu 4 | 5 | # Instead of using a random port, bind the worker to a specific port 6 | ENV SPARK_WORKER_PORT 8888 7 | EXPOSE 8888 8 | 9 | ADD files /root/spark_worker_files 10 | 11 | # Add the entrypoint script for the master 12 | CMD ["-h"] 13 | ENTRYPOINT ["/root/spark_worker_files/default_cmd"] 14 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-worker/build: -------------------------------------------------------------------------------- 1 | rm -f files/files.hash 2 | for i in `find . -type f | sed s/"\.\/"//`; do git hash-object $i | tr -d '\n'; echo -e "\t$i"; done > /tmp/files.hash 3 | mv /tmp/files.hash files/files.hash 4 | sudo docker build -t ${IMAGE_PREFIX}spark-worker:1.0.0 . 
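
NOTE: the spark-shell entrypoint above seeds HDFS with test.txt and then runs test.spark against it. A sketch to verify the seed data by hand, assuming it is run from a container whose Hadoop client is already configured (for example the shell container itself):

    #!/bin/bash
    # Inspect the seeded test data; test.spark should then count 3 lines and collect
    #   Array(this is a test, more test, one more line)
    # which is exactly the string test/test_all.sh greps for.
    sudo -u hdfs hadoop fs -ls hdfs://master:9000/user/hdfs/test.txt
    sudo -u hdfs hadoop fs -cat hdfs://master:9000/user/hdfs/test.txt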
5 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-worker/files/default_cmd: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/spark_files/configure_spark.sh 4 | 5 | IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') 6 | echo "WORKER_IP=$IP" 7 | 8 | echo "preparing Spark" 9 | prepare_spark $1 10 | 11 | echo "starting Hadoop Datanode" 12 | service hadoop-datanode start 13 | 14 | echo "starting sshd" 15 | /usr/sbin/sshd 16 | 17 | sleep 5 18 | 19 | echo "starting Spark Worker" 20 | cp /root/spark_worker_files/run_spark_worker.sh / 21 | chmod a+rx /run_spark_worker.sh 22 | sudo -u hdfs /run_spark_worker.sh 23 | -------------------------------------------------------------------------------- /spark-1.0.0/spark-worker/files/run_spark_worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/spark-1.0.0/conf/spark-env.sh 3 | ${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.worker.Worker $MASTER 4 | -------------------------------------------------------------------------------- /test/test_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ "$USER" != "root" ]]; then 4 | echo "please run as: sudo $0" 5 | exit 1 6 | fi 7 | 8 | BASEDIR=$(cd $(dirname $0); pwd)"/.." 9 | service_list=("spark:0.9.0" "shark:0.8.0" "spark:0.8.0" "spark:0.7.3" "shark:0.7.0" ) 10 | 11 | IMAGE_PREFIX="" 12 | #"amplab/" 13 | 14 | START=$(date) 15 | echo "starting tests at $START" > tests.log 16 | 17 | RESULT=0 18 | FAILED=0 19 | 20 | check_screen_session_alive() { 21 | screen -q -ls > /dev/null 22 | if (( $? < 10 )); then 23 | SCREEN_ALIVE=1 24 | fi 25 | } 26 | 27 | function wait_for_prompt() { 28 | service=$1 29 | OUTFILE=$2 30 | SCREEN_ALIVE=0 31 | 32 | if [[ "$service" == "spark" ]]; then 33 | query_string="scala>\s$" 34 | else 35 | query_string="^shark>\s$\|\s\s\s\s\s>\s$" 36 | fi 37 | 38 | tail -n 1 $OUTFILE | tr -d $'\r' | grep "$query_string" > /dev/null 39 | STOP="$?" 40 | until [[ "$STOP" == "0" ]]; do 41 | sleep 1 42 | check_screen_session_alive 43 | if [[ "$SCREEN_ALIVE" == "0" ]]; then 44 | sudo screen -S tmpshell -p 0 -X stuff $'\n' 45 | tail -n 1 $OUTFILE | tr -d $'\r' | grep "$query_string" > /dev/null 46 | STOP="$?" 47 | else 48 | break 49 | fi 50 | done 51 | } 52 | 53 | function check_result() { 54 | service=$1 55 | outfile=$2 56 | 57 | if [[ "$service" == "spark" ]]; then 58 | grep "Array(this is a test, more test, one more line)" $outfile > /dev/null 59 | RESULT="$?" 60 | elif [[ "$service" == "shark" ]]; then 61 | cat $outfile | tr -d $'\r' | grep "^500$" > /dev/null 62 | RESULT="$?" 
63 | fi 64 | } 65 | 66 | # NOTE: the order matters but this is the right one 67 | for i in ${service_list[@]}; do 68 | service=$(echo $i | awk -F ":" '{print $1}') 69 | version=$(echo $i | awk -F ":" '{print $2}') 70 | dirname=${service}-${version} 71 | LOGFILE=${BASEDIR}/test/${dirname}.log 72 | OUTFILE=${BASEDIR}/test/${dirname}.out 73 | rm -f "$LOGFILE" "$OUTFILE" 74 | START=$(date) 75 | echo "starting tests at $START" > $LOGFILE 76 | $BASEDIR/deploy/deploy.sh -i ${IMAGE_PREFIX}${i} 1>>$LOGFILE 2>&1 77 | NAMESERVER_IP=$(grep NAMESERVER_IP ${dirname}.log | awk '{print $2}') 78 | MASTER_IP=$(grep MASTER_IP ${dirname}.log | awk '{print $2}') 79 | 80 | # we need this to set screen's output logfile 81 | cat << EOF >/tmp/screenrc 82 | logfile $OUTFILE 83 | EOF 84 | cat > cmd.sh < /dev/null 2>&1 108 | 109 | $BASEDIR/deploy/kill_all.sh $service 1>> $LOGFILE 2>&1 110 | $BASEDIR/deploy/kill_all.sh nameserver 1>> $LOGFILE 2>&1 111 | check_result "$service" "$OUTFILE" 112 | echo "RESULT: $RESULT" >> $LOGFILE 113 | END=$(date) 114 | echo "ending tests at $END" >> $LOGFILE 115 | let "FAILED=FAILED+RESULT" 116 | done 117 | 118 | echo "FAILED: $FAILED" 119 | 120 | if [[ "$FAILED" == "0" ]]; then 121 | exit 0 122 | else 123 | exit 1 124 | fi 125 | --------------------------------------------------------------------------------
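
NOTE: test_all.sh above must run as root, expects to be invoked from the test/ directory (it greps the per-service logs by relative name), and exits 0 only when every service/version combination passes. A usage sketch from the repository root:

    #!/bin/bash
    cd test
    sudo ./test_all.sh
    echo "overall exit status: $?"   # 0 only if the FAILED counter stayed at 0
    grep "RESULT:" *.log             # per-service result (0 = pass)
    ls *.out                         # captured screen sessions, useful when a test fails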