├── .gitignore ├── LICENSE ├── README.md ├── data ├── batches │ └── sample_batch.py ├── grades.csv └── ssn-address.tsv ├── docker-compose.yml ├── init.sql └── zeppelin_notebooks └── test_2FVBJBJ1V.zpln /.gitignore: -------------------------------------------------------------------------------- 1 | # OS garbage 2 | .DS_Store 3 | desktop.ini 4 | 5 | # IDE garbage 6 | .idea/ 7 | 8 | # Livy batch files, copied over from elsewhere, except one sample batch 9 | data/batches/* 10 | !data/batches/sample_batch.py 11 | 12 | # Spark job results 13 | data/output/ 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2020 Vadim Panov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Big data playground: Cluster with Hadoop, Hive, Spark, Zeppelin and Livy via Docker-compose. 2 | 3 | I wanted to have the ability to play around with various big data 4 | applications as effortlessly as possible, 5 | namely those found in Amazon EMR. 6 | Ideally, that would be something that can be brought up and torn down 7 | in one command. This is how this repository came to be! 
8 | 9 | ## Constituent images 10 | 11 | [Base image](https://github.com/panovvv/hadoop-hive-spark-docker): 12 | [![Docker Build Status: Base image](https://img.shields.io/docker/cloud/build/panovvv/hadoop-hive-spark.svg)](https://cloud.docker.com/repository/docker/panovvv/hadoop-hive-spark/builds) 13 | [![Docker Pulls: Base image](https://img.shields.io/docker/pulls/panovvv/hadoop-hive-spark.svg)](https://hub.docker.com/r/panovvv/hadoop-hive-spark) 14 | [![Docker Stars: Base image](https://img.shields.io/docker/stars/panovvv/hadoop-hive-spark.svg)](https://hub.docker.com/r/panovvv/hadoop-hive-spark) 15 | 16 | [Zeppelin image](https://github.com/panovvv/zeppelin-bigdata-docker): [![Docker Build Status: Zeppelin](https://img.shields.io/docker/cloud/build/panovvv/zeppelin-bigdata.svg)](https://cloud.docker.com/repository/docker/panovvv/zeppelin-bigdata/builds) 17 | [![Docker Pulls: Zeppelin](https://img.shields.io/docker/pulls/panovvv/zeppelin-bigdata.svg)](https://hub.docker.com/r/panovvv/zeppelin-bigdata) 18 | [![Docker Stars: Zeppelin](https://img.shields.io/docker/stars/panovvv/zeppelin-bigdata.svg)](https://hub.docker.com/r/panovvv/zeppelin-bigdata) 19 | 20 | [Livy image](https://github.com/panovvv/livy-docker): [![Docker Build Status: Livy](https://img.shields.io/docker/cloud/build/panovvv/livy.svg)](https://cloud.docker.com/repository/docker/panovvv/livy/builds) 21 | [![Docker Pulls: Livy](https://img.shields.io/docker/pulls/panovvv/livy.svg)](https://hub.docker.com/r/panovvv/livy) 22 | [![Docker Stars: Livy](https://img.shields.io/docker/stars/panovvv/livy.svg)](https://hub.docker.com/r/panovvv/livy) 23 | 24 | ## Usage 25 | 26 | Clone: 27 | ```bash 28 | git clone https://github.com/panovvv/bigdata-docker-compose.git 29 | ``` 30 | * On non-Linux platforms, you should dedicate more RAM to Docker than it allocates by default 31 | (2 GB on my machine with 16 GB of RAM). Otherwise, applications (the ResourceManager in my case) 32 | will quit sporadically and you'll see messages like this one in the logs: 33 |
 34 |   current-datetime INFO org.apache.hadoop.util.JvmPauseMonitor: Detected pause in JVM or host machine (eg GC): pause of approximately 1234ms
 35 |   No GCs detected
 36 |   
37 | Increasing the memory limit to 8 GB solved all of those mysterious problems for me. 38 | 39 | * Keep disk usage on the Docker host below 90%, otherwise 40 | YARN will deem all nodes unhealthy (90% utilization is the default threshold of its disk health checker). 41 | 42 | Bring everything up: 43 | ```bash 44 | cd bigdata-docker-compose 45 | docker-compose up -d 46 | ``` 47 | 48 | * **data/** directory is mounted into every container; you can use it as 49 | storage both for files you want to process using Hive/Spark/whatever 50 | and for the results of those computations. 51 | * **livy_batches/** directory holds some sample code for 52 | Livy's batch processing mode. It's mounted into the container where Livy 53 | is running. You can store your code there as well, or make use of the 54 | universal **data/**. 55 | * **zeppelin_notebooks/** contains, quite predictably, notebook files 56 | for Zeppelin. Thanks to that, all your notebooks persist across runs. 57 | 58 | The Hive JDBC port is exposed to the host: 59 | * URI: `jdbc:hive2://localhost:10000` 60 | * Driver: `org.apache.hive.jdbc.HiveDriver` (org.apache.hive:hive-jdbc:3.1.2) 61 | * User and password: unused. 62 | 63 | To shut the whole thing down, run this from the same folder: 64 | ```bash 65 | docker-compose down 66 | ``` 67 | 68 | ## Checking if everything plays well together 69 | You can quickly check everything by opening the 70 | [bundled Zeppelin notebook](http://localhost:8890) 71 | and running all of its paragraphs. 72 | 73 | Alternatively, to get a sense of 74 | how it all works under the hood, follow the instructions below: 75 | 76 | ### Hadoop and YARN 77 | 78 | Check the [YARN (Hadoop ResourceManager) Web UI 79 | (localhost:8088)](http://localhost:8088/). 80 | You should see 2 active nodes there. 81 | There's also an 82 | [alternative YARN Web UI 2 (http://localhost:8088/ui2)](http://localhost:8088/ui2). 83 | 84 | Then check the [Hadoop Name Node UI (localhost:9870)](http://localhost:9870) and the 85 | Hadoop Data Node UIs at 86 | [http://localhost:9864](http://localhost:9864) and [http://localhost:9865](http://localhost:9865): 87 | all of those URLs should return a page. 88 | 89 | Open up a shell on the master node: 90 | ```bash 91 | docker-compose exec master bash 92 | jps 93 | ``` 94 | The `jps` command outputs a list of running Java processes, 95 | which on the Hadoop NameNode/Spark master node should include these: 96 |
 97 | 123 Jps
 98 | 456 ResourceManager
 99 | 789 NameNode
100 | 234 SecondaryNameNode
101 | 567 HistoryServer
102 | 890 Master
103 | 
104 | 105 | ... though not necessarily in this order or with these exact IDs; 106 | some extras like `RunJar` and `JobHistoryServer` might be there too. 107 | 108 | Then let's see if YARN can see all the resources we have (2 worker nodes): 109 | ```bash 110 | yarn node -list 111 | ``` 112 |
113 | current-datetime INFO client.RMProxy: Connecting to ResourceManager at master/172.28.1.1:8032
114 | Total Nodes:2
115 |          Node-Id	     Node-State	Node-Http-Address	Number-of-Running-Containers
116 |    worker1:45019	        RUNNING	     worker1:8042	                           0
117 |    worker2:41001	        RUNNING	     worker2:8042	                           0
118 | 
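If you'd rather script this check from the host, the ResourceManager exposes the same information through its REST API on the mapped port 8088. Here's a minimal sketch using only the Python standard library (the printed fields come from the standard `/ws/v1/cluster/nodes` response):
```python
import json
import urllib.request

# YARN ResourceManager REST API, reachable from the host thanks to the 8088:8088 mapping
with urllib.request.urlopen("http://localhost:8088/ws/v1/cluster/nodes") as response:
    nodes = json.load(response)["nodes"]["node"]

for node in nodes:
    # Expect two RUNNING entries, one per worker
    print(node["id"], node["state"], node["numContainers"])
```
You should see worker1 and worker2 in the RUNNING state, mirroring the `yarn node -list` output above.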
119 | 120 | Check the condition of HDFS (the Hadoop Distributed File System): 121 | ```bash 122 | hdfs dfsadmin -report 123 | ``` 124 |
125 | Live datanodes (2):
126 | Name: 172.28.1.2:9866 (worker1)
127 | ...
128 | Name: 172.28.1.3:9866 (worker2)
129 | 
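You can watch the same numbers from the host via the NameNode's JMX endpoint on the mapped port 9870. A minimal sketch; the bean and metric names below (`FSNamesystemState`, `NumLiveDataNodes`, `NumDeadDataNodes`) are standard Hadoop metrics, but they can shift between Hadoop versions, so treat them as an assumption:
```python
import json
import urllib.request

# NameNode JMX metrics, reachable from the host thanks to the 9870:9870 mapping
url = "http://localhost:9870/jmx?qry=Hadoop:service=NameNode,name=FSNamesystemState"
with urllib.request.urlopen(url) as response:
    state = json.load(response)["beans"][0]

# Should report 2 live datanodes and 0 dead ones
print("Live datanodes:", state["NumLiveDataNodes"])
print("Dead datanodes:", state["NumDeadDataNodes"])
```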
130 | 131 | Now we'll upload a file into HDFS and see that it's visible from all 132 | nodes: 133 | ```bash 134 | hadoop fs -put /data/grades.csv / 135 | hadoop fs -ls / 136 | ``` 137 |
138 | Found N items
139 | ...
140 | -rw-r--r--   2 root supergroup  ... /grades.csv
141 | ...
142 | 
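The upload can also be verified from the host without opening a shell in any container, because WebHDFS rides on the same NameNode port 9870 that's already mapped. A minimal sketch using the WebHDFS `GETFILESTATUS` operation (the printed fields come from the standard `FileStatus` response):
```python
import json
import urllib.request

# WebHDFS endpoint on the NameNode, reachable via the 9870:9870 mapping
url = "http://localhost:9870/webhdfs/v1/grades.csv?op=GETFILESTATUS"
with urllib.request.urlopen(url) as response:
    status = json.load(response)["FileStatus"]

print(f"{status['length']} bytes, replication factor {status['replication']}")
```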
143 | 144 | Ctrl+D out of master now. Repeat for the remaining nodes 145 | (there are 3 in total: master, worker1 and worker2): 146 | 147 | ```bash 148 | docker-compose exec worker1 bash 149 | hadoop fs -ls / 150 | ``` 151 |
152 | Found 1 items
153 | -rw-r--r--   2 root supergroup  ... /grades.csv
154 | 
155 | 156 | While we're on a node other than the Hadoop NameNode/Spark master node, 157 | the `jps` output should now include `DataNode` and `Worker` instead of 158 | `NameNode` and `Master`: 159 | ```bash 160 | jps 161 | ``` 162 |
163 | 123 Jps
164 | 456 NodeManager
165 | 789 DataNode
166 | 234 Worker
167 | 
168 | 169 | ### Hive 170 | 171 | Prerequisite: there's a file `grades.csv` stored in HDFS ( `hadoop fs -put /data/grades.csv /` ) 172 | ```bash 173 | docker-compose exec master bash 174 | hive 175 | ``` 176 | ```sql 177 | CREATE TABLE grades( 178 | `Last name` STRING, 179 | `First name` STRING, 180 | `SSN` STRING, 181 | `Test1` DOUBLE, 182 | `Test2` INT, 183 | `Test3` DOUBLE, 184 | `Test4` DOUBLE, 185 | `Final` DOUBLE, 186 | `Grade` STRING) 187 | COMMENT 'https://people.sc.fsu.edu/~jburkardt/data/csv/csv.html' 188 | ROW FORMAT DELIMITED 189 | FIELDS TERMINATED BY ',' 190 | STORED AS TEXTFILE 191 | tblproperties("skip.header.line.count"="1"); 192 | 193 | LOAD DATA INPATH '/grades.csv' INTO TABLE grades; 194 | 195 | SELECT * FROM grades; 196 | -- OK 197 | -- Alfalfa Aloysius 123-45-6789 40.0 90 100.0 83.0 49.0 D- 198 | -- Alfred University 123-12-1234 41.0 97 96.0 97.0 48.0 D+ 199 | -- Gerty Gramma 567-89-0123 41.0 80 60.0 40.0 44.0 C 200 | -- Android Electric 087-65-4321 42.0 23 36.0 45.0 47.0 B- 201 | -- Bumpkin Fred 456-78-9012 43.0 78 88.0 77.0 45.0 A- 202 | -- Rubble Betty 234-56-7890 44.0 90 80.0 90.0 46.0 C- 203 | -- Noshow Cecil 345-67-8901 45.0 11 -1.0 4.0 43.0 F 204 | -- Buff Bif 632-79-9939 46.0 20 30.0 40.0 50.0 B+ 205 | -- Airpump Andrew 223-45-6789 49.0 1 90.0 100.0 83.0 A 206 | -- Backus Jim 143-12-1234 48.0 1 97.0 96.0 97.0 A+ 207 | -- Carnivore Art 565-89-0123 44.0 1 80.0 60.0 40.0 D+ 208 | -- Dandy Jim 087-75-4321 47.0 1 23.0 36.0 45.0 C+ 209 | -- Elephant Ima 456-71-9012 45.0 1 78.0 88.0 77.0 B- 210 | -- Franklin Benny 234-56-2890 50.0 1 90.0 80.0 90.0 B- 211 | -- George Boy 345-67-3901 40.0 1 11.0 -1.0 4.0 B 212 | -- Heffalump Harvey 632-79-9439 30.0 1 20.0 30.0 40.0 C 213 | -- Time taken: 3.324 seconds, Fetched: 16 row(s) 214 | ``` 215 | 216 | Ctrl+D back to bash. Check if the file's been loaded to Hive warehouse 217 | directory: 218 | 219 | ```bash 220 | hadoop fs -ls /usr/hive/warehouse/grades 221 | ``` 222 |
223 | Found 1 items
224 | -rw-r--r--   2 root supergroup  ... /usr/hive/warehouse/grades/grades.csv
225 | 
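Since the HiveServer2 port 10000 is mapped to the host (see the Usage section above), the table is also queryable from outside the containers. A minimal sketch using the third-party PyHive client - assuming `pip install 'pyhive[hive]'` on the host; the username is arbitrary because authentication is unused:
```python
from pyhive import hive  # third-party HiveServer2 client, not part of this repo

# HiveServer2 Thrift endpoint, reachable via the 10000:10000 mapping
connection = hive.connect(host="localhost", port=10000, username="root")
cursor = connection.cursor()

cursor.execute("SELECT `grade`, COUNT(*) FROM grades GROUP BY `grade`")
for grade, count in cursor.fetchall():
    print(grade, count)

cursor.close()
connection.close()
```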
226 | 227 | The table we just created should be accessible from all nodes; let's 228 | verify that now: 229 | ```bash 230 | docker-compose exec worker2 bash 231 | hive 232 | ``` 233 | ```sql 234 | SELECT * FROM grades; 235 | ``` 236 | You should be able to see the same table. 237 | ### Spark 238 | 239 | Open up the [Spark Master Web UI (localhost:8080)](http://localhost:8080/): 240 |
241 | Workers (2)
242 | Worker Id	Address	State	Cores	Memory
243 | worker-timestamp-172.28.1.3-8882	172.28.1.3:8882	ALIVE	2 (0 Used)	1024.0 MB (0.0 B Used)
244 | worker-timestamp-172.28.1.2-8881	172.28.1.2:8881	ALIVE	2 (0 Used)	1024.0 MB (0.0 B Used)
245 | 
246 | , also worker UIs at [localhost:8081](http://localhost:8081/) 247 | and [localhost:8082](http://localhost:8082/). All those pages should be 248 | accessible. 249 | 250 | Then there's also Spark History server running at 251 | [localhost:18080](http://localhost:18080/) - every time you run Spark jobs, you 252 | will see them here. 253 | 254 | History Server includes REST API at 255 | [localhost:18080/api/v1/applications](http://localhost:18080/api/v1/applications). 256 | This is a mirror of everything on the main page, only in JSON format. 257 | 258 | Let's run some sample jobs now: 259 | ```bash 260 | docker-compose exec master bash 261 | run-example SparkPi 10 262 | #, or you can do the same via spark-submit: 263 | spark-submit --class org.apache.spark.examples.SparkPi \ 264 | --master yarn \ 265 | --deploy-mode client \ 266 | --driver-memory 2g \ 267 | --executor-memory 1g \ 268 | --executor-cores 1 \ 269 | $SPARK_HOME/examples/jars/spark-examples*.jar \ 270 | 10 271 | ``` 272 |
273 | INFO spark.SparkContext: Running Spark version 2.4.4
274 | INFO spark.SparkContext: Submitted application: Spark Pi
275 | ..
276 | INFO client.RMProxy: Connecting to ResourceManager at master/172.28.1.1:8032
277 | INFO yarn.Client: Requesting a new application from cluster with 2 NodeManagers
278 | ...
279 | INFO yarn.Client: Application report for application_1567375394688_0001 (state: ACCEPTED)
280 | ...
281 | INFO yarn.Client: Application report for application_1567375394688_0001 (state: RUNNING)
282 | ...
283 | INFO scheduler.DAGScheduler: Job 0 finished: reduce at SparkPi.scala:38, took 1.102882 s
284 | Pi is roughly 3.138915138915139
285 | ...
286 | INFO util.ShutdownHookManager: Deleting directory /tmp/spark-81ea2c22-d96e-4d7c-a8d7-9240d8eb22ce
287 | 
288 | 289 | Spark has 3 interactive shells: spark-shell to code in Scala, 290 | pyspark for Python and sparkR for R. Let's try them all out: 291 | ```bash 292 | hadoop fs -put /data/grades.csv / 293 | spark-shell 294 | ``` 295 | ```scala 296 | spark.range(1000 * 1000 * 1000).count() 297 | 298 | val df = spark.read.format("csv").option("header", "true").load("/grades.csv") 299 | df.show() 300 | 301 | df.createOrReplaceTempView("df") 302 | spark.sql("SHOW TABLES").show() 303 | spark.sql("SELECT * FROM df WHERE Final > 50").show() 304 | 305 | //TODO SELECT TABLE from hive - not working for now. 306 | spark.sql("SELECT * FROM grades").show() 307 | ``` 308 |
309 | Spark context Web UI available at http://localhost:4040
310 | Spark context available as 'sc' (master = yarn, app id = application_N).
311 | Spark session available as 'spark'.
312 | 
313 | res0: Long = 1000000000
314 | 
315 | df: org.apache.spark.sql.DataFrame = [Last name: string, First name: string ... 7 more fields]
316 | 
317 | +---------+----------+-----------+-----+-----+-----+-----+-----+-----+
318 | |Last name|First name|        SSN|Test1|Test2|Test3|Test4|Final|Grade|
319 | +---------+----------+-----------+-----+-----+-----+-----+-----+-----+
320 | |  Alfalfa|  Aloysius|123-45-6789|   40|   90|  100|   83|   49|   D-|
321 | ...
322 | |Heffalump|    Harvey|632-79-9439|   30|    1|   20|   30|   40|    C|
323 | +---------+----------+-----------+-----+-----+-----+-----+-----+-----+
324 | 
325 | +--------+---------+-----------+
326 | |database|tableName|isTemporary|
327 | +--------+---------+-----------+
328 | |        |       df|       true|
329 | +--------+---------+-----------+
330 | 
331 | +---------+----------+-----------+-----+-----+-----+-----+-----+-----+
332 | |Last name|First name|        SSN|Test1|Test2|Test3|Test4|Final|Grade|
333 | +---------+----------+-----------+-----+-----+-----+-----+-----+-----+
334 | |  Airpump|    Andrew|223-45-6789|   49|    1|   90|  100|   83|    A|
335 | |   Backus|       Jim|143-12-1234|   48|    1|   97|   96|   97|   A+|
336 | | Elephant|       Ima|456-71-9012|   45|    1|   78|   88|   77|   B-|
337 | | Franklin|     Benny|234-56-2890|   50|    1|   90|   80|   90|   B-|
338 | +---------+----------+-----------+-----+-----+-----+-----+-----+-----+
339 | 
340 | Ctrl+D out of Scala shell now. 341 | 342 | ```bash 343 | pyspark 344 | ``` 345 | ```python 346 | spark.range(1000 * 1000 * 1000).count() 347 | 348 | df = spark.read.format('csv').option('header', 'true').load('/grades.csv') 349 | df.show() 350 | 351 | df.createOrReplaceTempView('df') 352 | spark.sql('SHOW TABLES').show() 353 | spark.sql('SELECT * FROM df WHERE Final > 50').show() 354 | 355 | # TODO SELECT TABLE from hive - not working for now. 356 | spark.sql('SELECT * FROM grades').show() 357 | ``` 358 |
359 | 1000000000
360 | 
361 | $same_tables_as_above
362 | 
363 | Ctrl+D out of PySpark. 364 | 365 | ```bash 366 | sparkR 367 | ``` 368 | ```R 369 | df <- as.DataFrame(list("One", "Two", "Three", "Four"), "This is as example") 370 | head(df) 371 | 372 | df <- read.df("/grades.csv", "csv", header="true") 373 | head(df) 374 | ``` 375 |
376 |   This is as example
377 | 1                One
378 | 2                Two
379 | 3              Three
380 | 4               Four
381 | 
382 | $same_tables_as_above
383 | 
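The same kind of CSV exploration also works as a non-interactive job. Below is a minimal sketch of a standalone PySpark script - the file name `grades_report.py` and the output path `/output/grades_summary` are just names made up for this example. Save it into the shared **data/** directory and submit it with `spark-submit --master yarn /data/grades_report.py`, just like the SparkPi example earlier:
```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("grades-report").getOrCreate()

# grades.csv was uploaded to the HDFS root earlier in this walkthrough
df = spark.read.format("csv").option("header", "true").load("/grades.csv")

# Count students per letter grade among those with a passing final score
summary = (
    df.filter(col("Final").cast("double") > 50)
    .groupBy("Grade")
    .count()
)
summary.show()

# Persist the result in HDFS so it survives the Spark session
summary.write.mode("overwrite").csv("/output/grades_summary")

spark.stop()
```
Once the job finishes it should show up in the Spark History Server at [localhost:18080](http://localhost:18080/), and `hadoop fs -ls /output/grades_summary` should list the result files.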
384 | 385 | * Amazon S3 386 | 387 | From Hadoop: 388 | ```bash 389 | hadoop fs -Dfs.s3a.impl="org.apache.hadoop.fs.s3a.S3AFileSystem" -Dfs.s3a.access.key="classified" -Dfs.s3a.secret.key="classified" -ls "s3a://bucket" 390 | ``` 391 | 392 | Then from PySpark: 393 | 394 | ```python 395 | sc._jsc.hadoopConfiguration().set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem') 396 | sc._jsc.hadoopConfiguration().set('fs.s3a.access.key', 'classified') 397 | sc._jsc.hadoopConfiguration().set('fs.s3a.secret.key', 'classified') 398 | 399 | df = spark.read.format('csv').option('header', 'true').option('sep', '\t').load('s3a://bucket/tabseparated_withheader.tsv') 400 | df.show(5) 401 | ``` 402 | 403 | None of the commands above stores your credentials anywhere 404 | (i.e. as soon as you'd shut down the cluster your creds are safe). More 405 | persistent ways of storing the credentials are out of scope of this 406 | readme. 407 | 408 | ### Zeppelin 409 | 410 | Zeppelin interface should be available at [http://localhost:8890](http://localhost:8890). 411 | 412 | You'll find a notebook called "test" in there, containing commands 413 | to test integration with bash, Spark and Livy. 414 | 415 | ### Livy 416 | 417 | Livy is at [http://localhost:8998](http://localhost:8998) (and yes, 418 | there's a web UI as well as REST API on that port - just click the link). 419 | 420 | * Livy Sessions. 421 | 422 | Try to poll the REST API: 423 | ```bash 424 | curl --request GET \ 425 | --url http://localhost:8998/sessions | python3 -mjson.tool 426 | ``` 427 | The response, assuming you didn't create any sessions before, should look like this: 428 | ```json 429 | { 430 | "from": 0, 431 | "total": 0, 432 | "sessions": [] 433 | } 434 | ``` 435 | 436 | 1 ) Create a session: 437 | ```bash 438 | curl --request POST \ 439 | --url http://localhost:8998/sessions \ 440 | --header 'content-type: application/json' \ 441 | --data '{ 442 | "kind": "pyspark" 443 | }' | python3 -mjson.tool 444 | ``` 445 | Response: 446 | ```json 447 | { 448 | "id": 0, 449 | "name": null, 450 | "appId": null, 451 | "owner": null, 452 | "proxyUser": null, 453 | "state": "starting", 454 | "kind": "pyspark", 455 | "appInfo": { 456 | "driverLogUrl": null, 457 | "sparkUiUrl": null 458 | }, 459 | "log": [ 460 | "stdout: ", 461 | "\nstderr: ", 462 | "\nYARN Diagnostics: " 463 | ] 464 | } 465 | ``` 466 | 467 | 2 ) Wait for session to start (state will transition from "starting" 468 | to "idle"): 469 | ```bash 470 | curl --request GET \ 471 | --url http://localhost:8998/sessions/0 | python3 -mjson.tool 472 | ``` 473 | Response: 474 | ```json 475 | { 476 | "id": 0, 477 | "name": null, 478 | "appId": "application_1584274334558_0001", 479 | "owner": null, 480 | "proxyUser": null, 481 | "state": "starting", 482 | "kind": "pyspark", 483 | "appInfo": { 484 | "driverLogUrl": "http://worker2:8042/node/containerlogs/container_1584274334558_0003_01_000001/root", 485 | "sparkUiUrl": "http://master:8088/proxy/application_1584274334558_0003/" 486 | }, 487 | "log": [ 488 | "timestamp bla" 489 | ] 490 | } 491 | ``` 492 | 493 | 3 ) Post some statements: 494 | ```bash 495 | curl --request POST \ 496 | --url http://localhost:8998/sessions/0/statements \ 497 | --header 'content-type: application/json' \ 498 | --data '{ 499 | "code": "import sys;print(sys.version)" 500 | }' | python3 -mjson.tool 501 | curl --request POST \ 502 | --url http://localhost:8998/sessions/0/statements \ 503 | --header 'content-type: application/json' \ 504 | --data '{ 505 | "code": "spark.range(1000 * 
1000 * 1000).count()" 506 | }' | python3 -mjson.tool 507 | ``` 508 | Response: 509 | ```json 510 | { 511 | "id": 0, 512 | "code": "import sys;print(sys.version)", 513 | "state": "waiting", 514 | "output": null, 515 | "progress": 0.0, 516 | "started": 0, 517 | "completed": 0 518 | } 519 | ``` 520 | ```json 521 | { 522 | "id": 1, 523 | "code": "spark.range(1000 * 1000 * 1000).count()", 524 | "state": "waiting", 525 | "output": null, 526 | "progress": 0.0, 527 | "started": 0, 528 | "completed": 0 529 | } 530 | ``` 531 | 532 | 4) Get the result: 533 | ```bash 534 | curl --request GET \ 535 | --url http://localhost:8998/sessions/0/statements | python3 -mjson.tool 536 | ``` 537 | Response: 538 | ```json 539 | { 540 | "total_statements": 2, 541 | "statements": [ 542 | { 543 | "id": 0, 544 | "code": "import sys;print(sys.version)", 545 | "state": "available", 546 | "output": { 547 | "status": "ok", 548 | "execution_count": 0, 549 | "data": { 550 | "text/plain": "3.7.3 (default, Apr 3 2019, 19:16:38) \n[GCC 8.0.1 20180414 (experimental) [trunk revision 259383]]" 551 | } 552 | }, 553 | "progress": 1.0 554 | }, 555 | { 556 | "id": 1, 557 | "code": "spark.range(1000 * 1000 * 1000).count()", 558 | "state": "available", 559 | "output": { 560 | "status": "ok", 561 | "execution_count": 1, 562 | "data": { 563 | "text/plain": "1000000000" 564 | } 565 | }, 566 | "progress": 1.0 567 | } 568 | ] 569 | } 570 | ``` 571 | 572 | 5) Delete the session: 573 | ```bash 574 | curl --request DELETE \ 575 | --url http://localhost:8998/sessions/0 | python3 -mjson.tool 576 | ``` 577 | Response: 578 | ```json 579 | { 580 | "msg": "deleted" 581 | } 582 | ``` 583 | * Livy Batches. 584 | 585 | To get all active batches: 586 | ```bash 587 | curl --request GET \ 588 | --url http://localhost:8998/batches | python3 -mjson.tool 589 | ``` 590 | Strange enough, this elicits the same response as if we were querying 591 | the sessions endpoint, but ok... 592 | 593 | 1 ) Send the batch: 594 | ```bash 595 | curl --request POST \ 596 | --url http://localhost:8998/batches \ 597 | --header 'content-type: application/json' \ 598 | --data '{ 599 | "file": "local:/data/batches/sample_batch.py", 600 | "pyFiles": [ 601 | "local:/data/batches/sample_batch.py" 602 | ], 603 | "args": [ 604 | "123" 605 | ] 606 | }' | python3 -mjson.tool 607 | ``` 608 | Response: 609 | ```json 610 | { 611 | "id": 0, 612 | "name": null, 613 | "owner": null, 614 | "proxyUser": null, 615 | "state": "starting", 616 | "appId": null, 617 | "appInfo": { 618 | "driverLogUrl": null, 619 | "sparkUiUrl": null 620 | }, 621 | "log": [ 622 | "stdout: ", 623 | "\nstderr: ", 624 | "\nYARN Diagnostics: " 625 | ] 626 | } 627 | ``` 628 | 629 | 2 ) Query the status: 630 | ```bash 631 | curl --request GET \ 632 | --url http://localhost:8998/batches/0 | python3 -mjson.tool 633 | ``` 634 | Response: 635 | ```json 636 | { 637 | "id": 0, 638 | "name": null, 639 | "owner": null, 640 | "proxyUser": null, 641 | "state": "running", 642 | "appId": "application_1584274334558_0005", 643 | "appInfo": { 644 | "driverLogUrl": "http://worker2:8042/node/containerlogs/container_1584274334558_0005_01_000001/root", 645 | "sparkUiUrl": "http://master:8088/proxy/application_1584274334558_0005/" 646 | }, 647 | "log": [ 648 | "timestamp bla", 649 | "\nstderr: ", 650 | "\nYARN Diagnostics: " 651 | ] 652 | } 653 | ``` 654 | 655 | 3 ) To see all log lines, query the `/log` endpoint. 656 | You can skip 'to' and 'from' params, or manipulate them to get all log lines. 
657 | Livy (as of 0.7.0) supports no more than 100 log lines per response. 658 | ```bash 659 | curl --request GET \ 660 | --url 'http://localhost:8998/batches/0/log?from=100&to=200' | python3 -mjson.tool 661 | ``` 662 | Response: 663 | ```json 664 | { 665 | "id": 0, 666 | "from": 100, 667 | "total": 203, 668 | "log": [ 669 | "...", 670 | "Welcome to", 671 | " ____ __", 672 | " / __/__ ___ _____/ /__", 673 | " _\\ \\/ _ \\/ _ `/ __/ '_/", 674 | " /__ / .__/\\_,_/_/ /_/\\_\\ version 2.4.5", 675 | " /_/", 676 | "", 677 | "Using Python version 3.7.5 (default, Oct 17 2019 12:25:15)", 678 | "SparkSession available as 'spark'.", 679 | "3.7.5 (default, Oct 17 2019, 12:25:15) ", 680 | "[GCC 8.3.0]", 681 | "Arguments: ", 682 | "['/data/batches/sample_batch.py', '123']", 683 | "Custom number passed in args: 123", 684 | "Will raise 123 to the power of 3...", 685 | "...", 686 | "123 ^ 3 = 1860867", 687 | "...", 688 | "2020-03-15 13:06:09,503 INFO util.ShutdownHookManager: Deleting directory /tmp/spark-138164b7-c5dc-4dc5-be6b-7a49c6bcdff0/pyspark-4d73b7c7-e27c-462f-9e5a-96011790d059" 689 | ] 690 | } 691 | ``` 692 | 693 | 4 ) Delete the batch: 694 | ```bash 695 | curl --request DELETE \ 696 | --url http://localhost:8998/batches/0 | python3 -mjson.tool 697 | ``` 698 | Response: 699 | ```json 700 | { 701 | "msg": "deleted" 702 | } 703 | ``` 704 | 705 | ## Credits 706 | Sample data file: 707 | * __grades.csv__ is borrowed from 708 | [John Burkardt's page](https://people.sc.fsu.edu/~jburkardt/data/csv/csv.html) 709 | under Florida State University domain. Thanks for 710 | sharing those! 711 | 712 | * __ssn-address.tsv__ is derived from __grades.csv__ by removing some fields 713 | and adding randomly-generated addresses. 714 | 715 | -------------------------------------------------------------------------------- /data/batches/sample_batch.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from pyspark.shell import spark 4 | 5 | print(sys.version) 6 | print("Arguments: \n" + str(sys.argv)) 7 | 8 | try: 9 | num = int(sys.argv[1]) 10 | print("Custom number passed in args: " + str(num)) 11 | except (ValueError, IndexError): 12 | num = 1000 13 | print("Can't process as number: " + sys.argv[1]) 14 | 15 | # Checking if f-string are available (python>=3.6) 16 | print(f"Will raise {num} to the power of 3...") 17 | 18 | cube = spark.range(num * num * num).count() 19 | print(f"{num} ^ 3 = {cube}") 20 | -------------------------------------------------------------------------------- /data/grades.csv: -------------------------------------------------------------------------------- 1 | Last name,First name,SSN,Test1,Test2,Test3,Test4,Final,Grade 2 | Alfalfa,Aloysius,123-45-6789,40,90,100,83,49,D- 3 | Alfred,University,123-12-1234,41,97,96,97,48,D+ 4 | Gerty,Gramma,567-89-0123,41,80,60,40,44,C 5 | Android,Electric,087-65-4321,42,23,36,45,47,B- 6 | Bumpkin,Fred,456-78-9012,43,78,88,77,45,A- 7 | Rubble,Betty,234-56-7890,44,90,80,90,46,C- 8 | Noshow,Cecil,345-67-8901,45,11,-1,4,43,F 9 | Buff,Bif,632-79-9939,46,20,30,40,50,B+ 10 | Airpump,Andrew,223-45-6789,49,1,90,100,83,A 11 | Backus,Jim,143-12-1234,48,1,97,96,97,A+ 12 | Carnivore,Art,565-89-0123,44,1,80,60,40,D+ 13 | Dandy,Jim,087-75-4321,47,1,23,36,45,C+ 14 | Elephant,Ima,456-71-9012,45,1,78,88,77,B- 15 | Franklin,Benny,234-56-2890,50,1,90,80,90,B- 16 | George,Boy,345-67-3901,40,1,11,-1,4,B 17 | Heffalump,Harvey,632-79-9439,30,1,20,30,40,C 18 | 
-------------------------------------------------------------------------------- /data/ssn-address.tsv: -------------------------------------------------------------------------------- 1 | Alfalfa Aloysius 123-45-6789 7098 East Road Hopkins, MN 55343 2 | Backus Jim 143-12-1234 603 Wagon Drive Miamisburg, OH 45342 3 | Dandy Jim 087-75-4321 4 Ann St. Hackensack, NJ 07601 4 | George Boy 345-67-3901 13 Foxrun Ave. Annandale, VA 22003 5 | Alfred University 123-12-1234 98 Wellington Ave. Lowell, MA 01851 6 | Elephant Ima 456-71-9012 7 | Heffalump Harvey 632-79-9439 5 Beech Street Canyon Country, CA 91387 8 | Gerty Gramma 567-89-0123 9 | Rubble Betty 234-56-7890 9715 Penn St. Royal Oak, MI 48067 10 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | services: 3 | hivemetastore: 4 | image: postgres:11.5 5 | hostname: hivemetastore 6 | environment: 7 | POSTGRES_PASSWORD: new_password 8 | expose: 9 | - 5432 10 | volumes: 11 | - ./init.sql:/docker-entrypoint-initdb.d/init.sql 12 | healthcheck: 13 | test: ["CMD-SHELL", "pg_isready -U postgres"] 14 | interval: 10s 15 | timeout: 5s 16 | retries: 5 17 | networks: 18 | spark_net: 19 | ipv4_address: 172.28.1.4 20 | extra_hosts: 21 | - "master:172.28.1.1" 22 | - "worker1:172.28.1.2" 23 | - "worker2:172.28.1.3" 24 | - "zeppelin:172.28.1.5" 25 | - "livy:172.28.1.6" 26 | 27 | master: 28 | image: panovvv/hadoop-hive-spark:2.5.2 29 | # build: '../hadoop-hive-spark-docker' 30 | hostname: master 31 | depends_on: 32 | - hivemetastore 33 | environment: 34 | HADOOP_NODE: namenode 35 | HIVE_CONFIGURE: yes, please 36 | SPARK_PUBLIC_DNS: localhost 37 | SPARK_LOCAL_IP: 172.28.1.1 38 | SPARK_MASTER_HOST: 172.28.1.1 39 | SPARK_LOCAL_HOSTNAME: master 40 | expose: 41 | - 1-65535 42 | ports: 43 | # Spark Master Web UI 44 | - 8080:8080 45 | # Spark job Web UI: increments for each successive job 46 | - 4040:4040 47 | - 4041:4041 48 | - 4042:4042 49 | - 4043:4043 50 | # Spark History server 51 | - 18080:18080 52 | # YARN UI 53 | - 8088:8088 54 | # Hadoop namenode UI 55 | - 9870:9870 56 | # Hadoop secondary namenode UI 57 | - 9868:9868 58 | # Hive JDBC 59 | - 10000:10000 60 | volumes: 61 | - ./data:/data 62 | networks: 63 | spark_net: 64 | ipv4_address: 172.28.1.1 65 | extra_hosts: 66 | - "worker1:172.28.1.2" 67 | - "worker2:172.28.1.3" 68 | - "hivemetastore:172.28.1.4" 69 | - "zeppelin:172.28.1.5" 70 | - "livy:172.28.1.6" 71 | 72 | worker1: 73 | image: panovvv/hadoop-hive-spark:2.5.2 74 | # build: '../hadoop-hive-spark-docker' 75 | hostname: worker1 76 | depends_on: 77 | - hivemetastore 78 | environment: 79 | SPARK_MASTER_ADDRESS: spark://master:7077 80 | SPARK_WORKER_PORT: 8881 81 | SPARK_WORKER_WEBUI_PORT: 8081 82 | SPARK_PUBLIC_DNS: localhost 83 | SPARK_LOCAL_HOSTNAME: worker1 84 | SPARK_LOCAL_IP: 172.28.1.2 85 | SPARK_MASTER_HOST: 172.28.1.1 86 | HADOOP_NODE: datanode 87 | expose: 88 | - 1-65535 89 | ports: 90 | # Hadoop datanode UI 91 | - 9864:9864 92 | #Spark worker UI 93 | - 8081:8081 94 | volumes: 95 | - ./data:/data 96 | networks: 97 | spark_net: 98 | ipv4_address: 172.28.1.2 99 | extra_hosts: 100 | - "master:172.28.1.1" 101 | - "worker2:172.28.1.3" 102 | - "hivemetastore:172.28.1.4" 103 | - "zeppelin:172.28.1.5" 104 | - "livy:172.28.1.6" 105 | 106 | worker2: 107 | image: panovvv/hadoop-hive-spark:2.5.2 108 | # build: '../hadoop-hive-spark-docker' 109 | hostname: worker2 110 | depends_on: 111 | - hivemetastore 112 | 
environment: 113 | SPARK_MASTER_ADDRESS: spark://master:7077 114 | SPARK_WORKER_PORT: 8882 115 | SPARK_WORKER_WEBUI_PORT: 8082 116 | SPARK_PUBLIC_DNS: localhost 117 | SPARK_LOCAL_HOSTNAME: worker2 118 | SPARK_LOCAL_IP: 172.28.1.3 119 | SPARK_MASTER_HOST: 172.28.1.1 120 | HADOOP_NODE: datanode 121 | HADOOP_DATANODE_UI_PORT: 9865 122 | expose: 123 | - 1-65535 124 | ports: 125 | # Hadoop datanode UI 126 | - 9865:9865 127 | # Spark worker UI 128 | - 8082:8082 129 | volumes: 130 | - ./data:/data 131 | networks: 132 | spark_net: 133 | ipv4_address: 172.28.1.3 134 | extra_hosts: 135 | - "master:172.28.1.1" 136 | - "worker1:172.28.1.2" 137 | - "hivemetastore:172.28.1.4" 138 | - "zeppelin:172.28.1.5" 139 | - "livy:172.28.1.6" 140 | 141 | livy: 142 | image: panovvv/livy:2.5.2 143 | # build: '../livy-docker' 144 | hostname: livy 145 | depends_on: 146 | - master 147 | - worker1 148 | - worker2 149 | volumes: 150 | - ./livy_batches:/livy_batches 151 | - ./data:/data 152 | environment: 153 | - SPARK_MASTER=yarn 154 | # Intentionally not specified - if it's set here, then we can't override it 155 | # via REST API ("conf"={} map) 156 | # Can be client or cluster 157 | # - SPARK_DEPLOY_MODE=client 158 | 159 | - LOCAL_DIR_WHITELIST=/data/batches/ 160 | - ENABLE_HIVE_CONTEXT=false 161 | # Defaults are fine for variables below. Uncomment to change them. 162 | # - LIVY_HOST=0.0.0.0 163 | # - LIVY_PORT=8998 164 | expose: 165 | - 1-65535 166 | ports: 167 | - 8998:8998 168 | networks: 169 | spark_net: 170 | ipv4_address: 172.28.1.6 171 | extra_hosts: 172 | - "master:172.28.1.1" 173 | - "worker1:172.28.1.2" 174 | - "worker2:172.28.1.3" 175 | - "hivemetastore:172.28.1.4" 176 | - "zeppelin:172.28.1.5" 177 | 178 | zeppelin: 179 | image: panovvv/zeppelin-bigdata:2.5.2 180 | # build: '../zeppelin-bigdata-docker' 181 | hostname: zeppelin 182 | depends_on: 183 | - master 184 | - worker1 185 | - worker2 186 | - livy 187 | volumes: 188 | - ./zeppelin_notebooks:/zeppelin_notebooks 189 | - ./data:/data 190 | environment: 191 | ZEPPELIN_PORT: 8890 192 | ZEPPELIN_NOTEBOOK_DIR: '/zeppelin_notebooks' 193 | expose: 194 | - 8890 195 | ports: 196 | - 8890:8890 197 | networks: 198 | spark_net: 199 | ipv4_address: 172.28.1.5 200 | extra_hosts: 201 | - "master:172.28.1.1" 202 | - "worker1:172.28.1.2" 203 | - "worker2:172.28.1.3" 204 | - "hivemetastore:172.28.1.4" 205 | - "livy:172.28.1.6" 206 | 207 | networks: 208 | spark_net: 209 | ipam: 210 | driver: default 211 | config: 212 | - subnet: 172.28.0.0/16 -------------------------------------------------------------------------------- /init.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE "hivemetastoredb"; -------------------------------------------------------------------------------- /zeppelin_notebooks/test_2FVBJBJ1V.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%sh\n\n#Only load a file to HDFS if it\u0027s not already there - because of this you can run all paragraphs as many times as you like.\nhadoop fs -test -e /grades.csv\n\nif ! hadoop fs -test -e /grades.csv\nthen\n echo \"*******************************************\"\n echo \"grades.csv is not in HDFS yet! Uploading...\"\n echo \"*******************************************\"d\n hadoop fs -put /data/grades.csv /\nfi" 5 | }, 6 | { 7 | "text": "%sh\n\nhadoop fs -ls /" 8 | }, 9 | { 10 | "text": "%jdbc\n\n-- Does not support more than one statement per paragraph, it seems. 
Same goes for semicolon at the end of statements - errors out if you include it.\nDROP TABLE IF EXISTS grades" 11 | }, 12 | { 13 | "text": "%jdbc\n\nCREATE TABLE grades(\n `Last name` STRING,\n `First name` STRING,\n `SSN` STRING,\n `Test1` DOUBLE,\n `Test2` INT,\n `Test3` DOUBLE,\n `Test4` DOUBLE,\n `Final` DOUBLE,\n `Grade` STRING)\nCOMMENT \u0027https://people.sc.fsu.edu/~jburkardt/data/csv/csv.html\u0027\nROW FORMAT DELIMITED\nFIELDS TERMINATED BY \u0027,\u0027\nSTORED AS TEXTFILE\ntblproperties(\"skip.header.line.count\"\u003d\"1\")" 14 | }, 15 | { 16 | "text": "%jdbc\n\nLOAD DATA INPATH \u0027/grades.csv\u0027 INTO TABLE grades" 17 | }, 18 | { 19 | "text": "%jdbc\n\nSELECT * FROM grades" 20 | }, 21 | { 22 | "text": "%sh\n\n# Take a look at the warehouse directory, specifically where our Hive table is stored.\n hadoop fs -ls /usr/hive/warehouse/grades" 23 | }, 24 | { 25 | "text": "%sh\n\n# Put the file back into HDFS - it was moved to warehouse directory when we loaded it with Hive.\nhadoop fs -put /data/grades.csv /\nhadoop fs -ls /" 26 | }, 27 | { 28 | "text": "%spark\n\n// Basic Spark functions\nspark.range(1000 * 1000 * 1000).count()" 29 | }, 30 | { 31 | "text": "%spark\n\n// Dataframes\nval df \u003d Seq(\n (\"One\", 1),\n (\"Two\", 2),\n (\"Three\", 3),\n (\"Four\", 4)\n).toDF(\"This is\", \"an example\")\ndf.show()" 32 | }, 33 | { 34 | "text": "%spark\n\n// Read CSV file from HDFS into Dataframe\nval df \u003d spark.read.format(\"csv\").option(\"header\", \"true\").load(\"/grades.csv\")\ndf.show()" 35 | }, 36 | { 37 | "text": "%spark\n\n// Spark SQL and temporary views\ndf.createOrReplaceTempView(\"df\")\nspark.sql(\"SHOW TABLES\").show()" 38 | }, 39 | { 40 | "text": "%spark\n\nspark.sql(\"SELECT * FROM df WHERE Final > 50\").show()" 41 | }, 42 | { 43 | "text": "%spark.pyspark\n\n# Check Python version - 2 not allowed.\nimport sys\nprint(sys.version)" 44 | }, 45 | { 46 | "text": "%spark.pyspark\n\n# Basic Spark functions\nspark.range(1000 * 1000 * 1000).count()" 47 | }, 48 | { 49 | "text": "%spark.pyspark\n\n# Dataframes\ndf \u003d sqlContext.createDataFrame([(\"One\", 1), (\"Two\", 2), (\"Three\", 3), (\"Four\", 4)], (\"This is\", \"an example\"))\ndf.show()" 50 | }, 51 | { 52 | "text": "%spark.pyspark\n\n# Read CSV file from HDFS into Dataframe\ndf \u003d spark.read.format(\"csv\").option(\"header\", \"true\").load(\"/grades.csv\")\ndf.show()" 53 | }, 54 | { 55 | "text": "%spark.r\n\n# Dataframes\ndf \u003c- as.DataFrame(list(\"One\", \"Two\", \"Three\", \"Four\"), \"This is as example\")\nhead(df)" 56 | }, 57 | { 58 | "text": "%spark.r\n\n# Read CSV file from HDFS into Dataframe\ndf \u003c- read.df(\"/grades.csv\", \"csv\", header\u003d\"true\")\nhead(df)" 59 | }, 60 | { 61 | "text": "%livy\n\n// Scala Spark over Livy\nspark.range(1000 * 1000 * 1000).count()" 62 | }, 63 | { 64 | "text": "%livy.pyspark\n\n# PySpark over Livy\nimport sys\nprint(sys.version)\nspark.range(1000 * 1000 * 1000).count()" 65 | }, 66 | { 67 | "text": "%livy.sparkr\n\n# SparkR over Livy\ndf \u003c- as.DataFrame(list(\"One\", \"Two\", \"Three\", \"Four\"), \"This is as example\")\nhead(df)" 68 | }, 69 | { 70 | "text": "%livy.sql\nSELECT 1, CONCAT(\u0027This is\u0027, \u0027 a test\u0027)" 71 | } 72 | ], 73 | "name": "test", 74 | "id": "2FVBJBJ1V", 75 | "defaultInterpreterGroup": "spark", 76 | "version": "0.9.0", 77 | "noteParams": {}, 78 | "noteForms": {}, 79 | "angularObjects": {}, 80 | "config": { 81 | "isZeppelinNotebookCronEnable": false 82 | }, 83 | "info": {} 84 | } 85 | 
--------------------------------------------------------------------------------