├── jars └── stub ├── run.sh ├── Makefile ├── profiles ├── clusters ├── README.md ├── Dockerfile ├── docker-compose.yml └── application.conf /jars/stub: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | /opt/spark-notebook/bin/spark-notebook -Dconfig.file=/opt/spark-notebook/conf/application.conf 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | up: 2 | docker network create spark-net 3 | docker-compose build 4 | docker-compose up 5 | 6 | down: 7 | docker-compose down 8 | docker network rm spark-net 9 | -------------------------------------------------------------------------------- /profiles: -------------------------------------------------------------------------------- 1 | { 2 | "standalone" : { 3 | "id" : "Local Spark Cluster", 4 | "name" : "Local Spark Cluster", 5 | "template" : { 6 | "customLocalRepo" : null, 7 | "customRepos" : null, 8 | "customDeps" : null, 9 | "customImports" : null, 10 | "customSparkConf" : { 11 | "spark.app.name" : "Notebook", 12 | "spark.master" : "spark://localhost:7077", 13 | "spark.executor.memory" : "5G" 14 | } 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /clusters: -------------------------------------------------------------------------------- 1 | { 2 | "Spark" : { 3 | "profile" : "Local Spark Cluster", 4 | "name" : "Local Spark", 5 | "status" : "stopped", 6 | "template" : { 7 | "customLocalRepo" : null, 8 | "customRepos" : null, 9 | "customDeps" : null, 10 | "customImports" : null, 11 | "customSparkConf" : { 12 | "spark.app.name" : "Notebook", 13 | "spark.master" : "spark://localhost:7077", 14 | "spark.executor.memory" : "4G" 15 | } 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Gitter chat](https://badges.gitter.im/gitterHQ/gitter.png)](https://gitter.im/big-data-europe/Lobby) 2 | 3 | # Docker for Spark Notebook 4 | 5 | Docker image for Spark Notebook 6 | 7 | Example usage: 8 | ``` 9 | version: "2" 10 | services: 11 | spark-notebook: 12 | image: bde2020/hadoop-spark-notebook:2.1.0-hadoop2.8-hive 13 | container_name: spark-notebook 14 | environment: 15 | - NOTEBOOKS_DIR=/data/notebooks 16 | env_file: 17 | - ./hadoop.env 18 | ports: 19 | - 9001:9001 20 | ``` 21 | 22 | ## Note 23 | For Spark docker see [BDE repository](https://github.com/big-data-europe/docker-spark) 24 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM bde2020/hadoop-base:1.1.0-hadoop2.8-java8 2 | MAINTAINER Ivan Ermilov 3 | 4 | ENV APACHE_SPARK_VERSION 2.1.0 5 | ENV APACHE_HADOOP_VERSION 2.8.0 6 | 7 | RUN set -x \ 8 | && curl -fSL "https://dl.dropboxusercontent.com/u/4882345/spark-notebook/spark-notebook-0.7.0-scala-2.11.8-spark-2.1.0-hadoop-2.8.0-with-hive.tar.gz" -o /tmp/spark-notebook.tgz \ 9 | && tar -xzvf /tmp/spark-notebook.tgz -C /opt/ \ 10 | && mv /opt/spark-notebook-* /opt/spark-notebook \ 11 | && rm /tmp/spark-notebook.tgz 12 | 13 | COPY run.sh /run.sh 14 | RUN chmod a+x /run.sh 15 | 16 | COPY application.conf /opt/spark-notebook/conf/ 17 | COPY clusters /opt/spark-notebook/conf/ 18 | COPY profiles /opt/spark-notebook/conf/ 19 | COPY jars /jars 20 | 21 | RUN mkdir -p /data/resources 22 | 23 | ENV NOTEBOOKS_DIR "/opt/spark-notebook/notebooks" 24 | ENV RESOURCES_DIR "/data/resources" 25 | ENV SPARK_MASTER "spark://spark-master:7077" 26 | ENV SPARK_EXECUTOR_MEMORY "4G" 27 | ENV EXTRA_CLASSPATH "/jars/*" 28 | 29 | WORKDIR /opt/spark-notebook/ 30 | 31 | CMD ["/run.sh"] 32 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2.1" 2 | 3 | services: 4 | namenode: 5 | image: bde2020/hadoop-namenode:1.1.0-hadoop2.8-java8 6 | container_name: namenode 7 | volumes: 8 | - ./data/namenode:/hadoop/dfs/name 9 | environment: 10 | - CLUSTER_NAME=test 11 | - CORE_CONF_fs_defaultFS=hdfs://namenode:8020 12 | healthcheck: 13 | interval: 5s 14 | retries: 100 15 | networks: 16 | - spark-net 17 | datanode: 18 | image: bde2020/hadoop-datanode:1.1.0-hadoop2.8-java8 19 | container_name: datanode 20 | volumes: 21 | - ./data/datanode:/hadoop/dfs/data 22 | environment: 23 | - CORE_CONF_fs_defaultFS=hdfs://namenode:8020 24 | depends_on: 25 | namenode: 26 | condition: service_healthy 27 | healthcheck: 28 | interval: 5s 29 | retries: 100 30 | networks: 31 | - spark-net 32 | spark-master: 33 | image: bde2020/spark-master:2.1.0-hadoop2.8-hive-java8 34 | container_name: spark-master 35 | ports: 36 | - "8080:8080" 37 | - "7077:7077" 38 | environment: 39 | - CORE_CONF_fs_defaultFS=hdfs://namenode:8020 40 | depends_on: 41 | namenode: 42 | condition: service_healthy 43 | datanode: 44 | condition: service_healthy 45 | healthcheck: 46 | interval: 5s 47 | retries: 100 48 | networks: 49 | - spark-net 50 | spark-worker: 51 | image: bde2020/spark-worker:2.1.0-hadoop2.8-hive-java8 52 | environment: 53 | - "SPARK_MASTER=spark://spark-master:7077" 54 | environment: 55 | - CORE_CONF_fs_defaultFS=hdfs://namenode:8020 56 | depends_on: 57 | spark-master: 58 | condition: service_healthy 59 | healthcheck: 60 | interval: 5s 61 | retries: 100 62 | networks: 63 | - spark-net 64 | spark-notebook: 65 | build: . 66 | ports: 67 | - 9001:9001 68 | depends_on: 69 | spark-master: 70 | condition: service_healthy 71 | namenode: 72 | condition: service_healthy 73 | networks: 74 | - spark-net 75 | 76 | networks: 77 | spark-net: 78 | external: 79 | name: spark-net 80 | -------------------------------------------------------------------------------- /application.conf: -------------------------------------------------------------------------------- 1 | # This is the main configuration file for the application. 2 | # ~~~~~ 3 | 4 | # Secret key 5 | # ~~~~~ 6 | # The secret key is used to secure cryptographics functions. 7 | # If you deploy your application to several instances be sure to use the same key! 8 | application.secret = "nTnOIy6^yFM5o[Z_T6jBriIYm7id43TSeLJC1U?bxt?PhfMJeCYX@s;RcNqX]xeA" 9 | 10 | # The application languages 11 | # ~~~~~ 12 | application.langs = "en" 13 | 14 | # Global object class 15 | # ~~~~~ 16 | # Define the Global object class for this application. 17 | # Default to Global in the root package. 18 | # application.global=Global 19 | 20 | # Router 21 | # ~~~~~ 22 | # Define the Router object to use for this application. 23 | # This router will be looked up first when the application is starting up, 24 | # so make sure this is the entry point. 25 | # Furthermore, it's assumed your route file is named properly. 26 | # So for an application router like `my.application.Router`, 27 | # you may need to define a router file `conf/my.application.routes`. 28 | # Default to Routes in the root package (and conf/routes) 29 | # application.router=my.application.Routes 30 | 31 | 32 | # Logger 33 | # ~~~~~ 34 | # You can also configure logback (http://logback.qos.ch/), 35 | # by providing an application-logger.xml file in the conf directory. 36 | 37 | # Root logger: 38 | logger.root = ERROR 39 | 40 | # Logger used by the framework: 41 | logger.play = INFO 42 | 43 | # Logger provided to your application: 44 | logger.application = DEBUG 45 | 46 | manager { 47 | notebooks { 48 | ### 49 | # Server dir (containing notebook files) 50 | dir = ./notebooks 51 | dir = ${?NOTEBOOKS_DIR} 52 | 53 | ### 54 | # Default custom configuration for all **CREATED** notebooks 55 | ### 56 | custom { 57 | sparkConf = { 58 | spark.app.name: "Notebook", 59 | spark.master: ${?SPARK_MASTER}, 60 | spark.executor.memory: ${?SPARK_EXECUTOR_MEMORY} 61 | } 62 | } 63 | ### 64 | # Override custom configuration for **ALL** notebooks 65 | # TO USE WITH CARE → could break full reproductability 66 | # 67 | override { 68 | sparkConf = { 69 | spark.app.name: "Notebook", 70 | spark.master: ${?SPARK_MASTER}, 71 | spark.executor.memory: ${?SPARK_EXECUTOR_MEMORY} 72 | } 73 | } 74 | } 75 | 76 | ### 77 | # Static resources to be made available on the web server 78 | # You may add your own resource directories 79 | # Paths may be relative to the server root, or absolute. 80 | resources=[${?RESOURCES_DIR}] 81 | 82 | ## 83 | # Name of SparkNotebook 84 | name = "Spark Notebook" 85 | 86 | ## 87 | # 88 | maxBytesInFlight = 5M 89 | 90 | kernel { 91 | ### 92 | # Uncomment to kill kernel after inactivity timeout 93 | # killTimeout = 60 minute 94 | 95 | ### 96 | # Uncomment to not to start the kernel (spark-context) automatically when notebook was open 97 | ## autostartOnNotebookOpen = false 98 | 99 | ### 100 | # Uncomment to enable remote vm debugging on the provided port 101 | # 102 | #debug.port=9090 103 | 104 | ### 105 | # Change the level of debug in the logs/sn-session-$kernelId-$notebookPath.log file 106 | # 107 | #log.level=debug 108 | 109 | ### 110 | # Add vmArgs to the remote process 111 | #vmArgs=["-XX:+PrintGCDetails", "-XX:+PrintGCDetails", "-Dsun.io.serialization.extendedDebugInfo=true"] 112 | 113 | ### 114 | # Working directory for kernel VMs 115 | #dir=. 116 | 117 | 118 | ### 119 | # List of URLs of kernel init scripts (to be run when a kernel first starts). 120 | #init=[] 121 | 122 | ### 123 | # Kernel VM memory settings 124 | #heap=4g 125 | #stack=-1 #default XSS 126 | permGen = 1024m 127 | #reservedCodeCache=-1 #default 128 | 129 | ### 130 | # Classpath for kernel VMs (defaults to server VM classpath) 131 | #classpath=[] 132 | 133 | ### 134 | # REPL compiler options: Use the deprecation warning by default for more 135 | # useful feedback about obsolete functions, etc. 136 | # REPL compiler options: Use the -deprecation warning by default for more 137 | # useful feedback about obsolete functions, etc. Use the -feature warning 138 | # for more explicit warnings about "optional" language features that 139 | # should be enabled explicitly. One of those that's used by Spark Notebook 140 | # itself is "reflective calls". 141 | compilerArgs=["-deprecation", "-feature", "-language:reflectiveCalls"] 142 | } 143 | 144 | clusters { 145 | profiles=/opt/spark-notebook/conf/profiles 146 | file=/opt/spark-notebook/conf/clusters 147 | } 148 | } 149 | 150 | notebook-server { 151 | akka { 152 | loggers = ["akka.event.slf4j.Slf4jLogger"] 153 | loglevel = "DEBUG" 154 | stdout-loglevel = "DEBUG" 155 | 156 | log-config-on-start = off 157 | 158 | daemonic = true 159 | 160 | debug { 161 | ## enable function of LoggingReceive, which is to log any received message at DEBUG level 162 | # receive = on 163 | ## enable DEBUG logging of all AutoReceiveMessages (Kill, PoisonPill and the like) 164 | # autoreceive = on 165 | ## enable DEBUG logging of actor lifecycle changes 166 | # lifecycle = on 167 | } 168 | 169 | actor { 170 | provider = "akka.remote.RemoteActorRefProvider" 171 | 172 | default-stash-dispatcher { 173 | mailbox-type = "akka.dispatch.UnboundedDequeBasedMailbox" 174 | } 175 | } 176 | 177 | remote { 178 | ## Debugging: 179 | # log-sent-messages = on 180 | # log-received-messages = on 181 | 182 | enabled-transports = ["akka.remote.netty.tcp"] 183 | # transport = "akka.remote.netty.NettyRemoteTransport" 184 | 185 | ## see (http://doc.akka.io/docs/akka/snapshot/scala/remoting.html) 186 | # These configuration will help getting of akka timeouts, specially 187 | # → threshold (12 for instance, it's refered as a good choice on ec2) 188 | # → heartbeat-interval (10s to reduce the number of comm between the server and the notebook backend) 189 | # → acceptable-heartbeat-pause (90s to reduce the number of comm between the server and the notebook backend) 190 | # 191 | #transport-failure-detector.heartbeat-interval = 4 s 192 | #transport-failure-detector.threshold = 7.0 # raise it to 12 for instance on EC2/Mesos/Yarn/... 193 | #transport-failure-detector.max-sample-size = 100 194 | #transport-failure-detector.min-std-deviation = 100 ms 195 | #transport-failure-detector.acceptable-heartbeat-pause = 10 s 196 | #watch-failure-detector.heartbeat-interval = 1 s 197 | #watch-failure-detector.threshold = 10.0 # raise it to 12 for instance on EC2/Mesos/Yarn/... 198 | #watch-failure-detector.max-sample-size = 200 199 | #watch-failure-detector.min-std-deviation = 100 ms 200 | #watch-failure-detector.acceptable-heartbeat-pause = 10 s 201 | #watch-failure-detector.unreachable-nodes-reaper-interval = 1s 202 | #watch-failure-detector.expected-response-after = 3 s 203 | 204 | netty.tcp { 205 | hostname = "127.0.0.1" 206 | port = 0 207 | 208 | maximum-frame-size = "1 GiB" 209 | } 210 | } 211 | } 212 | } 213 | 214 | # http { 215 | # proxyHost = ... 216 | # proxyPort = ... 217 | # proxyUser = ... 218 | # proxyPassword = ... 219 | # nonProxyHosts = ... 220 | # } 221 | 222 | remote-repos { 223 | proxy { 224 | # "protocol" = ..., 225 | # "host" = ..., 226 | # "port" = ..., 227 | # "username" = ..., 228 | # "password" = ..., 229 | # "nonProxyHosts" = ... 230 | } 231 | } 232 | --------------------------------------------------------------------------------