├── charts_v1 ├── spark-dashboard-0.3.0.tgz ├── Chart.yaml ├── templates │ ├── grafana_dashboards.yaml │ ├── influx_pv.yaml │ ├── grafana_service.yaml │ ├── influx_service.yaml │ ├── grafana_pod.yaml │ ├── _helpers.tpl │ ├── influx_graphiteconf.yaml │ ├── grafana_datasource.yaml │ └── influx_pod.yaml ├── values.yaml └── README.md ├── dockerfiles_v2 ├── grafana.ini ├── victoriametrics-metrics-datasource.yml ├── entrypoint.sh ├── spark.yaml ├── telegraf.conf ├── README.md └── Dockerfile ├── .devcontainer └── devcontainer.json ├── dockerfiles_v1 ├── spark.yaml ├── entrypoint.sh ├── influx.yaml ├── influxdb.conf ├── README.md └── Dockerfile ├── LICENSE └── README.md /charts_v1/spark-dashboard-0.3.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cerndb/spark-dashboard/HEAD/charts_v1/spark-dashboard-0.3.0.tgz -------------------------------------------------------------------------------- /charts_v1/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A performance dashboard for Apache Spark 4 | name: spark-dashboard 5 | version: 0.3.0 6 | -------------------------------------------------------------------------------- /charts_v1/templates/grafana_dashboards.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ .Release.Name }}-dashboard 5 | data: 6 | {{- (.Files.Glob "grafana_dashboards/*").AsConfig | nindent 5 }} 7 | -------------------------------------------------------------------------------- /dockerfiles_v2/grafana.ini: -------------------------------------------------------------------------------- 1 | [plugins] 2 | allow_loading_unsigned_plugins = victoriametrics-metrics-datasource 3 | [dashboards] 4 | default_home_dashboard_path = /var/lib/grafana/dashboards/Spark_Perf_Dashboard_v04_PromQL.json 5 | 6 | -------------------------------------------------------------------------------- /dockerfiles_v2/victoriametrics-metrics-datasource.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: VictoriaMetrics 5 | type: victoriametrics-metrics-datasource 6 | access: proxy 7 | url: http://localhost:8428 8 | isDefault: true 9 | 10 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "hostRequirements": { 3 | "cpus": 2, 4 | "memory": "8gb", 5 | "storage": "16gb" 6 | }, 7 | "postCreateCommand": "pip install pyspark sparkmeasure tpcds_pyspark", 8 | "extensions": ["ms-python.python"] 9 | } 10 | 11 | -------------------------------------------------------------------------------- /dockerfiles_v2/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Start the services 4 | service grafana-server start 5 | service telegraf start 6 | ./victoria-metrics-prod 7 | 8 | # when running with docker run -d option this keeps the container running 9 | tail -f /dev/null 10 | 11 | 12 | -------------------------------------------------------------------------------- /dockerfiles_v1/spark.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: spark-dashboard 5 | orgId: 1 6 | 
folder: '' 7 | folderUid: '' 8 | type: file 9 | disableDeletion: false 10 | editable: true 11 | updateIntervalSeconds: 10 12 | options: 13 | path: /var/lib/grafana/dashboards 14 | 15 | -------------------------------------------------------------------------------- /dockerfiles_v2/spark.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: spark-dashboard 5 | orgId: 1 6 | folder: '' 7 | folderUid: '' 8 | type: file 9 | disableDeletion: false 10 | editable: true 11 | updateIntervalSeconds: 10 12 | options: 13 | path: /var/lib/grafana/dashboards 14 | 15 | -------------------------------------------------------------------------------- /dockerfiles_v1/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This takes care of changing ownership, useful when mounting 4 | # /var/lib/influxdb from an external volume 5 | chown -R influxdb:influxdb /var/lib/influxdb 6 | 7 | service influxdb start 8 | service grafana-server start 9 | 10 | # when running with docker run -d option this keeps the container running 11 | tail -f /dev/null 12 | -------------------------------------------------------------------------------- /charts_v1/templates/influx_pv.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.influxdb.storage.class }} 2 | apiVersion: v1 3 | kind: PersistentVolumeClaim 4 | metadata: 5 | name: {{ .Release.Name }}-influx 6 | {{- with .Values.influxdb.storage }} 7 | spec: 8 | storageClassName: {{ .class }} 9 | {{- if .zone }} 10 | selector: 11 | matchLabels: 12 | failure-domain.beta.kubernetes.io/zone: {{ .zone }} 13 | {{- end }} 14 | accessModes: 15 | - {{ .type }} 16 | resources: 17 | requests: 18 | storage: {{ .size }} 19 | {{- end }} 20 | {{- end }} 21 | -------------------------------------------------------------------------------- /charts_v1/templates/grafana_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ .Release.Name }}-grafana 5 | {{- with .Values.grafana }} 6 | spec: 7 | selector: 8 | app: grafana 9 | type: {{ .service.type }} 10 | {{- if and .service.lbSourceRange (eq .service.type "LoadBalancer") }} 11 | loadBalancerSourceRanges: 12 | - {{ .service.lbSourceRange }} 13 | {{- end }} 14 | ports: 15 | - name: grafana 16 | protocol: TCP 17 | port: {{ .service.port }} 18 | targetPort: {{ .service.targetPort }} 19 | {{- end }} 20 | 21 | -------------------------------------------------------------------------------- /charts_v1/values.yaml: -------------------------------------------------------------------------------- 1 | grafana: 2 | image: "grafana/grafana:10.4.0" 3 | service: 4 | type: "NodePort" 5 | port: 3000 6 | targetPort: 3000 7 | influxdb: 8 | image: "influxdb:1.8.10" 9 | disableReporting: "true" 10 | dbName: "graphite" 11 | service: 12 | # type: "LoadBalancer" 13 | # lbSourceRange: "128.141.0.0/16" 14 | type: "NodePort" 15 | influx: 16 | port: 8086 17 | targetPort: 8086 18 | graphite: 19 | port: 2003 20 | targetPort: 2003 21 | storage: 22 | # class: "MyStorageClass" 23 | # type: "ReadWriteOnce" 24 | size: "1Gi" 25 | 26 | -------------------------------------------------------------------------------- /charts_v1/templates/influx_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 
4 | name: {{ .Release.Name }}-influx 5 | {{ with .Values.influxdb }} 6 | spec: 7 | selector: 8 | app: influx 9 | type: {{ .service.type }} 10 | {{- if and .service.lbSourceRange (eq .service.type "LoadBalancer") }} 11 | loadBalancerSourceRanges: 12 | - {{ .service.lbSourceRange }} 13 | {{- end }} 14 | ports: 15 | - protocol: TCP 16 | name: influx 17 | port: {{ .service.influx.port }} 18 | targetPort: {{ .service.influx.targetPort }} 19 | - protocol: TCP 20 | name: graphite 21 | port: {{ .service.graphite.port }} 22 | targetPort: {{ .service.graphite.targetPort }} 23 | {{- end }} 24 | -------------------------------------------------------------------------------- /dockerfiles_v1/influx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: influx-sparkmeasure 5 | type: influxdb 6 | access: proxy 7 | orgId: 1 8 | url: http://localhost:8086 9 | password: 10 | user: 11 | database: 12 | sparkmeasure 13 | basicAuth: 14 | basicAuthUser: 15 | basicAuthPassword: 16 | withCredentials: 17 | isDefault: 18 | version: 1 19 | editable: true 20 | - name: influx-graphite 21 | type: influxdb 22 | access: proxy 23 | orgId: 1 24 | url: http://localhost:8086 25 | password: 26 | user: 27 | database: 28 | graphite 29 | basicAuth: 30 | basicAuthUser: 31 | basicAuthPassword: 32 | withCredentials: 33 | isDefault: 34 | version: 1 35 | editable: true 36 | 37 | -------------------------------------------------------------------------------- /dockerfiles_v2/telegraf.conf: -------------------------------------------------------------------------------- 1 | [[inputs.socket_listener]] 2 | service_address = "tcp://:2003" 3 | data_format = "graphite" 4 | separator = "." 5 | templates = [ 6 | # JVM source 7 | "*.*.jvm.pools.* username.applicationid.executorid.namespace.namespace.measurement*", 8 | # YARN source 9 | "*.*.applicationMaster.* username.applicationid.namespace.measurement*", 10 | # shuffle service source 11 | "*.shuffleService.* username.namespace.measurement*", 12 | # streaming 13 | "*.*.*.spark.streaming.* username.applicationid.executorid.namespace.namespace.id.measurement*", 14 | # generic template for driver and executor sources 15 | "username.applicationid.executorid.namespace.measurement*" ] 16 | 17 | [[outputs.http]] 18 | ## URL is the address to send metrics to 19 | url = "http://localhost:8428/api/v1/write" 20 | method = "POST" 21 | data_format = "prometheusremotewrite" 22 | tagexclude = ["host", "namespace"] 23 | 24 | # Configure if needed 25 | #[agent] 26 | # interval = "10s" 27 | # flush_interval = "10s" 28 | # flush_jitter = "0s" 29 | 30 | -------------------------------------------------------------------------------- /dockerfiles_v1/influxdb.conf: -------------------------------------------------------------------------------- 1 | # influxdb conf 2 | 3 | [meta] 4 | dir = "/var/lib/influxdb/meta" 5 | 6 | [data] 7 | dir = "/var/lib/influxdb/data" 8 | engine = "tsm1" 9 | wal-dir = "/var/lib/influxdb/wal" 10 | 11 | # Note Grafana http endpoint is on port 8086 by default. 12 | 13 | [[graphite]] 14 | enabled = true 15 | bind-address = ":2003" 16 | database = "graphite" 17 | retention-policy = "" 18 | protocol = "tcp" 19 | batch-size = 5000 20 | batch-pending = 10 21 | batch-timeout = "1s" 22 | consistency-level = "one" 23 | separator = "." 
24 | udp-read-buffer = 0 25 | templates = [ 26 | # JVM source 27 | "*.*.jvm.pools.* username.applicationid.process.namespace.namespace.measurement*", 28 | # YARN source 29 | "*.*.applicationMaster.* username.applicationid.namespace.measurement*", 30 | # shuffle service source 31 | "*.shuffleService.* username.namespace.measurement*", 32 | # streaming 33 | "*.*.*.spark.streaming.* username.applicationid.process.namespace.namespace.id.measurement*", 34 | # generic template for driver and executor sources 35 | "username.applicationid.process.namespace.measurement*" ] 36 | -------------------------------------------------------------------------------- /charts_v1/templates/grafana_pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ .Release.Name }}-grafana 5 | labels: 6 | app: grafana 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: grafana 12 | template: 13 | metadata: 14 | labels: 15 | app: grafana 16 | spec: 17 | containers: 18 | - name: grafana 19 | image: {{ .Values.grafana.image }} 20 | volumeMounts: 21 | - name: datasource-conf 22 | mountPath: /etc/grafana/provisioning/datasources/influx.yaml 23 | subPath: influx-datasource-config 24 | - name: datasource-conf 25 | mountPath: /etc/grafana/provisioning/dashboards/spark.yaml 26 | subPath: spark-dashboard-config 27 | - name: spark-dashboard 28 | mountPath: /var/lib/grafana/dashboards 29 | ports: 30 | - containerPort: {{ .Values.grafana.service.port }} 31 | volumes: 32 | - name: datasource-conf 33 | configMap: 34 | name: {{ .Release.Name }}-datasource 35 | - name: spark-dashboard 36 | configMap: 37 | name: {{ .Release.Name }}-dashboard 38 | -------------------------------------------------------------------------------- /charts_v1/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "spark_dashboard.name" -}} 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create a default fully qualified app name. 11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 12 | If release name contains chart name it will be used as a full name. 13 | */}} 14 | {{- define "spark_dashboard.fullname" -}} 15 | {{- if .Values.fullnameOverride -}} 16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 17 | {{- else -}} 18 | {{- $name := default .Chart.Name .Values.nameOverride -}} 19 | {{- if contains $name .Release.Name -}} 20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 21 | {{- else -}} 22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 23 | {{- end -}} 24 | {{- end -}} 25 | {{- end -}} 26 | 27 | {{/* 28 | Create chart name and version as used by the chart label. 
29 | */}} 30 | {{- define "spark_dashboard.chart" -}} 31 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 32 | {{- end -}} 33 | -------------------------------------------------------------------------------- /dockerfiles_v1/README.md: -------------------------------------------------------------------------------- 1 | # How to build and run the legacy (v01) Spark dashboard in a container image 2 | 3 | ## How to run 4 | Run the dashboard v01 using a container image from [Dockerhub](https://hub.docker.com/r/lucacanali/spark-dashboard): 5 | - There are a few ports needed and multiple options on how to expose them 6 | - Port 2003 is for Graphite ingestion, port 3000 is for Grafana, port 8086 is used internally by the Grafana data source 7 | - You can expose the ports from the container individually or just use `--network=host`. 8 | - Examples: 9 | ``` 10 | docker run --network=host -d lucacanali/spark-dashboard:v01 11 | or 12 | docker run -p 3000:3000 -p 2003:2003 -d lucacanali/spark-dashboard:v01 13 | or 14 | docker run -p 3000:3000 -p 2003:2003 -p 8086:8086 -d lucacanali/spark-dashboard:v01 15 | ``` 16 | 17 | ## Advanced: persist InfluxDB data across restarts 18 | - This shows an example of how to use a volume to store InfluxDB data. 19 | It allows preserving the history across runs when the container is restarted; 20 | otherwise InfluxDB starts from scratch each time. 21 | ``` 22 | docker run --network=host -v MYPATH/myinfluxdir:/var/lib/influxdb -d lucacanali/spark-dashboard:v01 23 | ``` 24 | 25 | ## How to build the image: 26 | ``` 27 | docker build -t spark-dashboard:v01 . 28 | ``` 29 | 30 | -------------------------------------------------------------------------------- /dockerfiles_v1/Dockerfile: -------------------------------------------------------------------------------- 1 | # Container image for Spark Dashboard 2 | # using InfluxDB and Grafana 3 | 4 | FROM ubuntu:22.04 5 | 6 | ENV INFLUXDB_VERSION 1.8.10 7 | ENV GRAFANA_VERSION 10.4.0 8 | ENV ARCH amd64 9 | 10 | RUN set -ex && \ 11 | apt-get update && \ 12 | apt-get install -qq -y curl libfontconfig musl && \ 13 | curl -O https://dl.grafana.com/oss/release/grafana_${GRAFANA_VERSION}_${ARCH}.deb && \ 14 | dpkg -i grafana_${GRAFANA_VERSION}_${ARCH}.deb && \ 15 | rm -f grafana_${GRAFANA_VERSION}_${ARCH}.deb && \ 16 | curl -O https://dl.influxdata.com/influxdb/releases/influxdb_${INFLUXDB_VERSION}_${ARCH}.deb && \ 17 | dpkg -i influxdb_${INFLUXDB_VERSION}_${ARCH}.deb && \ 18 | rm -f influxdb_${INFLUXDB_VERSION}_${ARCH}.deb 19 | 20 | COPY influxdb.conf /etc/influxdb/influxdb.conf 21 | COPY --chown=grafana:grafana grafana_dashboards /var/lib/grafana/dashboards 22 | COPY --chown=grafana:grafana influx.yaml /etc/grafana/provisioning/datasources/influx.yaml 23 | COPY --chown=grafana:grafana spark.yaml /etc/grafana/provisioning/dashboards/spark.yaml 24 | COPY entrypoint.sh /opt/entrypoint.sh 25 | 26 | # expose grafana dashboard (3000) and influxdb graphite (2003) and http (8086) endpoints 27 | EXPOSE 3000/tcp 2003/tcp 8086/tcp 28 | 29 | WORKDIR / 30 | ENTRYPOINT [ "/opt/entrypoint.sh" ] 31 | -------------------------------------------------------------------------------- /charts_v1/templates/influx_graphiteconf.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ .Release.Name }}-config 5 | data: 6 | influx-graphite-config: | 7 | [meta] 8 | dir = "/var/lib/influxdb/meta" 9 | 10 | [data] 11 | dir =
"/var/lib/influxdb/data" 12 | engine = "tsm1" 13 | wal-dir = "/var/lib/influxdb/wal" 14 | 15 | [[graphite]] 16 | enabled = true 17 | bind-address = ":2003" 18 | database = "graphite" 19 | retention-policy = "" 20 | protocol = "tcp" 21 | batch-size = 5000 22 | batch-pending = 10 23 | batch-timeout = "1s" 24 | consistency-level = "one" 25 | separator = "." 26 | udp-read-buffer = 0 27 | templates = [ 28 | # JVM source 29 | "*.*.jvm.pools.* username.applicationid.process.namespace.namespace.measurement*", 30 | # YARN source 31 | "*.*.applicationMaster.* username.applicationid.namespace.measurement*", 32 | # shuffle service source 33 | "*.shuffleService.* username.namespace.measurement*", 34 | # streaming 35 | "*.*.*.spark.streaming.* username.applicationid.process.namespace.namespace.id.measurement*", 36 | # generic template for driver and executor sources 37 | "username.applicationid.process.namespace.measurement*" ] 38 | -------------------------------------------------------------------------------- /charts_v1/templates/grafana_datasource.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ .Release.Name }}-datasource 5 | data: 6 | influx-datasource-config: | 7 | apiVersion: 1 8 | 9 | datasources: 10 | - name: influx-sparkmeasure 11 | type: influxdb 12 | access: proxy 13 | orgId: 1 14 | url: http://{{ .Release.Name }}-influx:8086 15 | password: 16 | user: 17 | database: 18 | sparkmeasure 19 | basicAuth: 20 | basicAuthUser: 21 | basicAuthPassword: 22 | withCredentials: 23 | isDefault: 24 | version: 1 25 | editable: true 26 | - name: influx-graphite 27 | type: influxdb 28 | access: proxy 29 | orgId: 1 30 | url: http://{{ .Release.Name }}-influx:8086 31 | password: 32 | user: 33 | database: 34 | graphite 35 | basicAuth: 36 | basicAuthUser: 37 | basicAuthPassword: 38 | withCredentials: 39 | isDefault: 40 | version: 1 41 | editable: true 42 | spark-dashboard-config: | 43 | apiVersion: 1 44 | 45 | providers: 46 | - name: spark-dashboard 47 | orgId: 1 48 | folder: '' 49 | folderUid: '' 50 | type: file 51 | disableDeletion: false 52 | editable: true 53 | updateIntervalSeconds: 10 54 | options: 55 | path: /var/lib/grafana/dashboards 56 | -------------------------------------------------------------------------------- /charts_v1/templates/influx_pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ .Release.Name }}-influx 5 | labels: 6 | app: influx 7 | {{- with .Values.influxdb }} 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app: influx 13 | template: 14 | metadata: 15 | labels: 16 | app: influx 17 | spec: 18 | containers: 19 | - name: influx 20 | image: {{ .image }} 21 | ports: 22 | - containerPort: {{ .service.graphite.port }} 23 | - containerPort: {{ .service.influx.port }} 24 | volumeMounts: 25 | - name: influx-data 26 | mountPath: /var/lib/influxdb 27 | - name: graphite-config 28 | mountPath: /etc/influxdb/influxdb.conf 29 | subPath: influx-graphite-config 30 | env: 31 | - name: INFLUXDB_REPORTING_DISABLED 32 | value: {{ .disableReporting | quote}} 33 | - name: INFLUXDB_DB 34 | value: {{ .dbName | quote }} 35 | {{- end }} 36 | volumes: 37 | - name: graphite-config 38 | configMap: 39 | name: {{ .Release.Name }}-config 40 | - name: influx-data 41 | {{- if .Values.influxdb.storage.class }} 42 | persistentVolumeClaim: 43 | claimName: {{ .Release.Name }}-influx 44 | {{- else 
}} 45 | emptyDir: {} 46 | {{- end }} 47 | 48 | -------------------------------------------------------------------------------- /dockerfiles_v2/README.md: -------------------------------------------------------------------------------- 1 | # How to build and run the Spark dashboard in a container image 2 | 3 | ## How to run 4 | Run the dashboard using a container image from [Dockerhub](https://hub.docker.com/r/lucacanali/spark-dashboard): 5 | - There are a few ports needed and multiple options on how to expose them 6 | - Port 2003 is for Graphite ingestion, port 3000 is for Grafana, port 8428 is used internally by the VictoriaMetrics data source 7 | - You can expose the ports from the container individually or just use `--network=host`. 8 | - Examples: 9 | ``` 10 | docker run --network=host -d lucacanali/spark-dashboard 11 | or 12 | docker run -p 3000:3000 -p 2003:2003 -d lucacanali/spark-dashboard 13 | or 14 | docker run -p 3000:3000 -p 2003:2003 -p 8428:8428 -d lucacanali/spark-dashboard 15 | ``` 16 | 17 | ## Persisting VictoriaMetrics Data Across Restarts 18 | By default, VictoriaMetrics does not retain data between container restarts—each time the container starts, it begins with an empty dataset. 19 | To preserve historical metrics, you need to mount a persistent volume for data storage. 20 | 21 | Below is an example of how to do this using a local directory: 22 | 23 | ``` 24 | # Create a directory to store VictoriaMetrics data 25 | mkdir metrics_data 26 | 27 | # Run the container with the local directory mounted as the data volume. 28 | # This ensures your metrics history survives container restarts. 29 | docker run --network=host \ 30 | -v ./metrics_data:/victoria-metrics-data \ 31 | -d lucacanali/spark-dashboard:v02 32 | ``` 33 | 34 | ## Example of how to build the image: 35 | ``` 36 | cd dockerfiles_v2 37 | docker build -t spark-dashboard:v02 . 38 | ``` 39 | 40 | -------------------------------------------------------------------------------- /charts_v1/README.md: -------------------------------------------------------------------------------- 1 | # How to install the Helm Chart for the Spark Dashboard 2 | 3 | The Helm chart is installed using [helm](https://helm.sh/docs/intro/quickstart/): 4 | ``` 5 | helm install spark-dashboard https://github.com/cerndb/spark-dashboard/raw/master/charts_v1/spark-dashboard-0.3.0.tgz 6 | ``` 7 | 8 | Other installation options: 9 | 10 | ``` 11 | # Install from source. 12 | # Prerequisite: download the repo and cd into the charts_v1 directory 13 | helm install spark-dashboard -f values.yaml . 14 | ``` 15 | 16 | ``` 17 | # Re-package and install 18 | helm package . 19 | helm install spark-dashboard spark-dashboard-0.3.0.tgz 20 | ``` 21 | 22 | Additional admin commands: 23 | ``` 24 | # Update the chart (after repackaging) 25 | helm upgrade --install spark-dashboard spark-dashboard-0.3.0.tgz 26 | 27 | # uninstall 28 | helm uninstall spark-dashboard 29 | 30 | # list and display installed components 31 | helm list 32 | kubectl get service spark-dashboard-grafana spark-dashboard-influx 33 | kubectl get pods |grep spark-dashboard 34 | kubectl get configmaps |grep spark-dashboard 35 | ``` 36 | 37 | ## Configuration options 38 | 39 | The provided configuration is for testing purposes; for production use you may need further configuration, as typical for these types of components.
40 | - The storage for InfluxDB can be defined in the `values.yaml` 41 | - If no storageClass is provided, an `emptyDir` volume will be allocated: the dashboard history will be lost when the 42 | underlying pod is restarted. You may want to use a persistent backend in the configuration instead. 43 | - The services exposed by `grafana` and `influx` in the example are of type `NodePort`. You can use `LoadBalancer` type if your Kubernetes distribution supports it. 44 | -------------------------------------------------------------------------------- /dockerfiles_v2/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | 3 | ENV TELEGRAF_VERSION 1.37.0-1 4 | ENV GRAFANA_VERSION 12.3.0 5 | ENV VM_VERSION v1.131.0 6 | ENV ARCH amd64 7 | ENV GRAFANA_VM_PLUGIN_VERSION v0.19.7 8 | ENV PLUGIN_PATH /var/lib/grafana/plugins 9 | 10 | # Download and install Grafana 11 | RUN set -ex && \ 12 | apt-get update && \ 13 | apt-get install -qq -y curl libfontconfig musl adduser && \ 14 | curl -O https://dl.grafana.com/oss/release/grafana_${GRAFANA_VERSION}_${ARCH}.deb && \ 15 | dpkg -i grafana_${GRAFANA_VERSION}_${ARCH}.deb && \ 16 | rm -f grafana_${GRAFANA_VERSION}_${ARCH}.deb 17 | 18 | # Copy the bundled dashboards for the spark-dashboard 19 | COPY grafana_dashboards /var/lib/grafana/dashboards 20 | COPY spark.yaml /etc/grafana/provisioning/dashboards/spark.yaml 21 | 22 | # Install and configure Grafana datasource for VictoriaMetrics 23 | RUN set -ex && \ 24 | curl -L -O https://github.com/VictoriaMetrics/victoriametrics-datasource/releases/download/${GRAFANA_VM_PLUGIN_VERSION}/victoriametrics-metrics-datasource-${GRAFANA_VM_PLUGIN_VERSION}.tar.gz && \ 25 | tar -xzf victoriametrics-metrics-datasource-${GRAFANA_VM_PLUGIN_VERSION}.tar.gz && \ 26 | find victoriametrics-metrics-datasource -type f -name "victoriametrics_backend_plugin*" !
-name "*linux_amd64" -exec rm -f {} + && \ 27 | mkdir ${PLUGIN_PATH} && \ 28 | mv victoriametrics-metrics-datasource ${PLUGIN_PATH} && \ 29 | rm victoriametrics-metrics-datasource-${GRAFANA_VM_PLUGIN_VERSION}.tar.gz 30 | 31 | COPY grafana.ini /etc/grafana/grafana.ini 32 | COPY victoriametrics-metrics-datasource.yml /etc/grafana/provisioning/datasources/victoriametrics-metrics-datasource.yml 33 | 34 | # Install and configure Telegraf 35 | RUN set -ex && \ 36 | curl -O https://repos.influxdata.com/debian/packages/telegraf_${TELEGRAF_VERSION}_${ARCH}.deb && \ 37 | dpkg -i telegraf_${TELEGRAF_VERSION}_${ARCH}.deb && \ 38 | rm -f telegraf_${TELEGRAF_VERSION}_${ARCH}.deb 39 | 40 | COPY telegraf.conf /etc/telegraf/telegraf.conf 41 | 42 | # Download and install VictoriaMetrics (VM) 43 | RUN set -ex && \ 44 | curl -L -O https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/victoria-metrics-linux-${ARCH}-${VM_VERSION}.tar.gz && \ 45 | tar -xzvf victoria-metrics-*.tar.gz && \ 46 | rm -f victoria-metrics-linux-${ARCH}-${VM_VERSION}.tar.gz 47 | 48 | # Copy the entrypoint script, it contains the startup commands 49 | COPY entrypoint.sh /opt/entrypoint.sh 50 | 51 | # Expose the ports for Grafana, Telegraf and VictoriaMetrics 52 | EXPOSE 3000/tcp 2003/tcp 8428/tcp 53 | 54 | WORKDIR / 55 | ENTRYPOINT [ "/opt/entrypoint.sh" ] 56 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark-Dashboard 2 | Real-Time Spark Monitoring & Performance Troubleshooting 3 | 4 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14718682.svg)](https://doi.org/10.5281/zenodo.14718682) 5 | [![Docker Pulls](https://img.shields.io/docker/pulls/lucacanali/spark-dashboard)](https://hub.docker.com/r/lucacanali/spark-dashboard) 6 | 7 | **Spark-Dashboard** offers a simple, intuitive interface for real-time monitoring of Apache Spark clusters. 8 | It visualizes key metrics, CPU, memory, task throughput, I/O, and more, as time series, making it easy to track trends, spot issues, 9 | and analyze workload evolution. 10 | Ideal for engineers and data teams, Spark-Dashboard streamlines Spark troubleshooting and root cause analysis. 11 | 12 | ## Key Features 13 | 14 | - **Real-time Performance Monitoring:** 15 | Visualize the evolution of Spark and system metrics, including CPU, memory, active tasks, and I/O, over time. Instantly spot trends and anomalies. 16 | 17 | - **Real-Time Visualization:** 18 | Integrated with Grafana for dynamic, interactive visualizations, enabling fast and effective performance analysis. 19 | 20 | - **Broad Compatibility:** 21 | Works with all major Apache Spark versions (4.x, 3.x) and across diverse cluster environments: Hadoop, Kubernetes, and Spark Standalone. 
22 | 23 | ### Contents 24 | - [Architecture](#architecture) 25 | - [How To Deploy the Spark Dashboard V2](#how-to-deploy-the-spark-dashboard) 26 | - [How to run the Spark Dashboard V2 on a container](#how-to-run-the-spark-dashboard-v2-on-a-container) 27 | - [Persisting metric storage across container restarts](https://github.com/cerndb/spark-dashboard#persisting-victoriametrics-data-across-restarts) 28 | - [Extended Spark dashboard](#extended-spark-dashboard) 29 | - [Notes on Running Spark Dashboard on Spark Connect](#notes-on-running-spark-dashboard-on-spark-connect) 30 | - [Examples and getting started with Spark Performance dashboards](#examples-and-getting-started-with-spark-performance-dashboards) 31 | - [Start small, testing with Spark in local mode](#start-small-testing-with-spark-in-local-mode) 32 | - [Measuring with Spark Dashboard while running TPCDS on a Spark cluster](#running-tpcds-on-a-spark-cluster) 33 | - [Old implementation (V1)](#old-implementation-v1) 34 | - [How to run the Spark dashboard V1 on a container](#how-to-run-the-spark-dashboard-v1-on-a-container) 35 | - [How to run the dashboard V1 on Kubernetes using Helm](#how-to-run-the-dashboard-v1-on-kubernetes-using-helm) 36 | - [Advanced configurations and notes](#advanced-configurations-and-notes) 37 | 38 | ### Resources 39 | - [![Watch the video](https://www.youtube.com/s/desktop/050e6796/img/favicon_32x32.png) Watch Spark-Dashboard demo and tutorial](https://www.youtube.com/watch?v=sLjAyDwpg80) 40 | - Notes on [Spark Dashboard](https://github.com/LucaCanali/Miscellaneous/tree/master/Spark_Dashboard) 41 | - Blog on [building an Apache Spark Performance Lab](https://db-blog.web.cern.ch/node/195) 42 | - Blog [on Spark Dashboard](https://db-blog.web.cern.ch/blog/luca-canali/2019-02-performance-dashboard-apache-spark) 43 | - Talk on Spark performance at [Data+AI Summit 2021](https://databricks.com/session_na21/monitor-apache-spark-3-on-kubernetes-using-metrics-and-plugins), [slides](http://canali.web.cern.ch/docs/Monitor_Spark3_on_Kubernetes_DataAI2021_LucaCanali.pdf) 44 | - [sparkMeasure](https://github.com/LucaCanali/sparkMeasure) a tool for performance troubleshooting of Apache Spark workloads 45 | - [TPCDS_PySpark](https://github.com/LucaCanali/Miscellaneous/tree/master/Performance_Testing/TPCDS_PySpark) a TPC-DS workload generator written in Python and designed to run at scale using Apache Spark 46 | 47 | Main author and contact: Luca.Canali@cern.ch 48 | 49 | --- 50 | ## Architecture 51 | 52 | ![Spark metrics dashboard architecture](https://raw.githubusercontent.com/LucaCanali/Miscellaneous/master/Spark_Dashboard/images/Spark_MetricsSystem_Grafana_Dashboard_V2.0.png "Spark metrics dashboard architecture") 53 | 54 | This diagram illustrates an end-to-end monitoring pipeline for Apache Spark built entirely on open-source components. 55 | The architecture is designed to deliver real-time insights into the performance and health of your Spark clusters through 56 | a seamless flow of data from metric generation to visualization. 57 | 58 | - **Apache Spark Metrics:** 59 | Apache Spark generates detailed performance metrics via its [metrics system](https://spark.apache.org/docs/latest/monitoring.html#metrics). 60 | Both the driver and executors emit a wide range of metrics—such as runtime, CPU usage, garbage collection (GC) time, memory utilization, shuffle statistics, and I/O metrics—in Graphite format. 61 | 62 | - **Telegraf:** 63 | Acting as the collection agent, Telegraf ingests the metrics emitted by Spark. 
It enriches these measurements with additional 64 | labels and tags to facilitate effective organization and analysis before forwarding them to the storage backend. 65 | 66 | - **VictoriaMetrics:** 67 | This robust time-series database efficiently stores the labeled metrics data. Its design is optimized for handling large volumes 68 | of timestamped, sequential data, making it ideal for monitoring and historical trend analysis. 69 | 70 | - **Grafana:** 71 | Grafana provides a dynamic visualization layer, querying VictoriaMetrics using PromQL/MetricsQL. The result is a set of interactive 72 | dashboards that display real-time metrics and trends, empowering users to monitor system performance and swiftly identify any bottlenecks. 73 | 74 | Together, these components form a cohesive and scalable monitoring solution tailored for Apache Spark environments. 75 | 76 | --- 77 | ## How To Deploy the Spark Dashboard 78 | 79 | This quickstart guide presents multiple methods for deploying Spark Dashboard. The **recommended** approach is to deploy 80 | Spark-Dashboard v2 using a container. 81 | 82 | ### How to run the Spark Dashboard V2 on a container 83 | Follow these steps to get started with the container image: 84 | 85 | #### 1. Start the container 86 | The provided container image is pre-configured to run VictoriaMetrics (for metrics storage) and Grafana (for visualization). 87 | You can start the container using either Docker or Podman: 88 | 89 | - **Using Docker:** 90 | 91 | `docker run -p 3000:3000 -p 2003:2003 -d lucacanali/spark-dashboard` 92 | 93 | - **Using Podman:** 94 | 95 | `podman run -p 3000:3000 -p 2003:2003 -d lucacanali/spark-dashboard` 96 | 97 | #### 2. Configure Apache Spark 98 | 99 | To send metrics from Spark to the dashboard, configure Spark to export its metrics to the Graphite endpoint provided by the container. 100 | 101 | **Method A: Using `metrics.properties`** 102 | 103 | Edit the `metrics.properties` file located in `$SPARK_CONF_DIR` and add the following configuration: 104 | 105 | # Configure Graphite sink for Spark metrics 106 | *.sink.graphite.host=localhost 107 | *.sink.graphite.port=2003 108 | *.sink.graphite.period=10 109 | *.sink.graphite.unit=seconds 110 | *.sink.graphite.prefix=lucatest 111 | 112 | # Enable JVM metrics collection 113 | *.source.jvm.class=org.apache.spark.metrics.source.JvmSource 114 | 115 | Optionally, add these settings to your Spark launch configuration (or `spark-defaults.conf`): 116 | 117 | --conf spark.metrics.staticSources.enabled=true 118 | --conf spark.metrics.appStatusSource.enabled=true 119 | 120 | **Method B: Passing Configuration via Command-Line** 121 | 122 | Alternatively, you can specify Spark metrics settings directly when launching your Spark application. 
For example: 123 | 124 | # Define the VictoriaMetrics Graphite endpoint (replace `hostname` with your actual host if needed) 125 | VICTORIAMETRICS_ENDPOINT=$(hostname) 126 | 127 | bin/spark-shell \ 128 | --conf "spark.metrics.conf.*.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink" \ 129 | --conf "spark.metrics.conf.*.sink.graphite.host=${VICTORIAMETRICS_ENDPOINT}" \ 130 | --conf "spark.metrics.conf.*.sink.graphite.port=2003" \ 131 | --conf "spark.metrics.conf.*.sink.graphite.period=10" \ 132 | --conf "spark.metrics.conf.*.sink.graphite.unit=seconds" \ 133 | --conf "spark.metrics.conf.*.sink.graphite.prefix=lucatest" \ 134 | --conf "spark.metrics.conf.*.source.jvm.class=org.apache.spark.metrics.source.JvmSource" \ 135 | --conf "spark.metrics.staticSources.enabled=true" \ 136 | --conf "spark.metrics.appStatusSource.enabled=true" 137 | 138 | *Optional:* To also collect and display "Tree Process Memory Details", add: 139 | 140 | --conf spark.executor.processTreeMetrics.enabled=true 141 | 142 | #### 3. Visualize Metrics in Grafana 143 | 144 | Once the container is running and Spark is configured to export metrics, you can view the performance data through Grafana: 145 | 146 | - **Access Grafana:** 147 | Open your web browser and navigate to [http://localhost:3000](http://localhost:3000) (replace `localhost` with your server's address if necessary). 148 | 149 | - **Login Credentials:** 150 | Use the default credentials: 151 | **User:** `admin` 152 | **Password:** `admin` 153 | 154 | - **Dashboard Overview:** 155 | The bundled dashboard (**Spark_Perf_Dashboard_v04_promQL**) displays a summary of key metrics (such as runtime, CPU usage, I/O, shuffle, task counts, etc.) along with detailed timeseries graphs. Select the appropriate username, application ID, and time range (default is the last 5 minutes) to customize your view. 156 | 157 | > **Important:** 158 | > Ensure that you have a running Spark application configured as detailed above so that metrics are available for selection and display. 159 | 160 | For testing purposes, you can generate load on Spark using [TPCDS_PySpark](https://github.com/LucaCanali/Miscellaneous/tree/master/Performance_Testing/TPCDS_PySpark), a TPC-DS workload generator written in Python and designed to run at scale with Apache Spark. 161 | 162 | ---- 163 | ### Persisting VictoriaMetrics Data Across Restarts 164 | 165 | By default, VictoriaMetrics does not retain data between container restarts—each time the container starts, it begins with an empty dataset. 166 | To preserve historical metrics, you need to mount a persistent volume for data storage. 167 | 168 | Below is an example of how to do this using a local directory: 169 | 170 | ``` 171 | # Create a directory to store VictoriaMetrics data 172 | mkdir metrics_data 173 | 174 | # Run the container with the local directory mounted as the data volume. 175 | # This ensures your metrics history survives container restarts. 176 | docker run --network=host \ 177 | -v ./metrics_data:/victoria-metrics-data \ 178 | -d lucacanali/spark-dashboard:v02 179 | ``` 180 | 181 | --- 182 | ### Extended Spark Dashboard 183 | 184 | Enhance your monitoring capabilities with the Extended Spark Dashboard, which collects and visualizes OS and storage metrics alongside standard Spark performance data. This enhanced pipeline leverages [Spark Plugins](https://github.com/cerndb/SparkPlugins) to gather additional metrics, all stored within the same VictoriaMetrics database as the standard Spark metrics. 
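For a quick end-to-end illustration, here is a sketch that combines the command-line sink settings shown earlier (Method B) with the plugin configuration detailed in the Configuration subsection below. It assumes the dashboard container from the previous sections is running on `localhost` and that the plugin version shown is still current:

```
# Sketch: send extended metrics from a local spark-shell to a running spark-dashboard container
# (plugin coordinates and sink settings are taken from the other sections of this README)
bin/spark-shell \
  --packages ch.cern.sparkmeasure:spark-plugins_2.12:0.4 \
  --conf "spark.plugins"="ch.cern.HDFSMetrics,ch.cern.CgroupMetrics,ch.cern.CloudFSMetrics" \
  --conf "spark.metrics.conf.*.sink.graphite.class"="org.apache.spark.metrics.sink.GraphiteSink" \
  --conf "spark.metrics.conf.*.sink.graphite.host"="localhost" \
  --conf "spark.metrics.conf.*.sink.graphite.port"=2003 \
  --conf "spark.metrics.conf.*.sink.graphite.period"=10 \
  --conf "spark.metrics.conf.*.sink.graphite.unit"=seconds \
  --conf "spark.metrics.conf.*.sink.graphite.prefix"="lucatest" \
  --conf "spark.metrics.staticSources.enabled"=true \
  --conf "spark.metrics.appStatusSource.enabled"=true
```

Metrics collected by the plugins then appear in the extended dashboard described below.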
185 | 186 | #### Additional Dashboard Features 187 | 188 | The extended dashboard introduces three extra groups of graphs beyond those available in the standard Spark Dashboard: 189 | 190 | - **CGroup Metrics** 191 | Collects data via CGroup instrumentation—ideal for Spark running on Kubernetes. 192 | 193 | - **Cloud Storage** 194 | Displays metrics from block storage systems such as S3A, GZ, WASB, and other cloud storage services. 195 | 196 | - **HDFS Advanced Statistics** 197 | Provides deeper insights into HDFS usage, offering additional performance metrics when Spark leverages HDFS. 198 | 199 | #### Configuration 200 | 201 | To enable extended metrics, add the following configurations to your Spark setup: 202 | 203 | --packages ch.cern.sparkmeasure:spark-plugins_2.12:0.4 204 | --conf spark.plugins=ch.cern.HDFSMetrics,ch.cern.CgroupMetrics,ch.cern.CloudFSMetrics 205 | 206 | #### Using the Extended Dashboard 207 | 208 | After configuring Spark, select the extended dashboard in Grafana to view the additional metrics: 209 | 210 | - **Dashboard Name:** `Spark_Perf_Dashboard_v04_PromQL_with_SparkPlugins` 211 | - The dashboard includes extra graphs for OS and storage metrics, offering a comprehensive view of your system's performance. 212 | 213 | ---- 214 | ### Notes on Running Spark Dashboard on Spark Connect 215 | 216 | [Spark Connect](https://spark.apache.org/docs/latest/spark-connect-overview.html) allows you to run a lightweight Spark client that connects remotely to your Spark cluster. 217 | When using Spark Connect, **Spark-Dashboard must be started on the Spark Connect server**, not on the client. Follow these steps: 218 | 219 | 1. **Start the Spark-Dashboard container** 220 | (see instructions above). 221 | 2. **Edit the `metrics.properties` file** 222 | in the Spark Connect `conf` directory as described above. 223 | 3. **Start Spark Connect** 224 | ``` 225 | sbin/start-connect-server.sh 226 | ``` 227 | Metrics from Spark Connect will now be sent to the Spark-Dashboard container and visualized in Grafana. 228 | 229 | ----- 230 | ## Examples and getting started with Spark Performance dashboards: 231 | - See some [examples of the dashboard graphs at this link](https://github.com/LucaCanali/Miscellaneous/tree/master/Spark_Dashboard#example-graphs) 232 | 233 | ### Start small, testing with Spark in local mode 234 | - You can use the [TPCDS_PySpark](https://github.com/LucaCanali/Miscellaneous/tree/master/Performance_Testing/TPCDS_PySpark) package to generate a TPC-DS workload and test the dashboard. 235 | - Run the following on local resources or cloud, for example use GitHub Codespaces from this repo 236 | - [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/cerndb/spark-dashboard) 237 | ``` 238 | # Install the tool and dependencies 239 | pip install pyspark 240 | pip install sparkmeasure 241 | pip install tpcds_pyspark 242 | 243 | # Download the test data 244 | wget https://sparkdltrigger.web.cern.ch/sparkdltrigger/TPCDS/tpcds_10.zip 245 | unzip -q tpcds_10.zip 246 | 247 | # 1. Run the tool for a minimal test 248 | tpcds_pyspark_run.py -d tpcds_10 -n 1 -r 1 --queries q1,q2 249 | 250 | # 2. Start the dashboard and visualize the metrics (use docker or podman) 251 | docker run -p 2003:2003 -p 3000:3000 -d lucacanali/spark-dashboard 252 | 253 | # 3.
Run the tpcds workload sending metrics to the dashboard 254 | TPCDS_PYSPARK=`which tpcds_pyspark_run.py` 255 | spark-submit --master local[*] \ 256 | --conf "spark.metrics.conf.*.sink.graphite.class"="org.apache.spark.metrics.sink.GraphiteSink" \ 257 | --conf "spark.metrics.conf.*.sink.graphite.host"="localhost" \ 258 | --conf "spark.metrics.conf.*.sink.graphite.port"=2003 \ 259 | --conf "spark.metrics.conf.*.sink.graphite.period"=10 \ 260 | --conf "spark.metrics.conf.*.sink.graphite.unit"=seconds \ 261 | --conf "spark.metrics.conf.*.sink.graphite.prefix"="lucatest" \ 262 | --conf "spark.metrics.conf.*.source.jvm.class"="org.apache.spark.metrics.source.JvmSource" \ 263 | --conf "spark.metrics.staticSources.enabled"=true \ 264 | --conf "spark.metrics.appStatusSource.enabled"=true \ 265 | --conf spark.driver.memory=4g \ 266 | --conf spark.log.level=error \ 267 | --packages ch.cern.sparkmeasure:spark-measure_2.12:0.25 \ 268 | $TPCDS_PYSPARK -d tpcds_10 269 | 270 | # 4. Accessing the Grafana Dashboard: 271 | # - Navigate to http://localhost:3000 to access the Grafana dashboard. 272 | # - If using GitHub Codespaces, use the "Ports" tab to open a browser window for this address. 273 | # - Default credentials for Grafana are username: admin and password: admin. 274 | # - Optionally, open the Spark WebUI at http://localhost:4040 to monitor the Spark job. 275 | 276 | # Wait a few minutes for metrics to populate the dashboard. 277 | # Note: This dashboard is more effective when Spark runs on cluster resources 278 | # rather than in the local mode demonstrated here. For more details, refer to the next paragraph. 279 | ``` 280 | 281 | 282 | ### Running TPCDS on a Spark cluster 283 | - Example of running TPCDS on a YARN Spark cluster, monitored with the Spark dashboard: 284 | ``` 285 | TPCDS_PYSPARK=`which tpcds_pyspark_run.py` 286 | 287 | spark-submit --master yarn --conf spark.log.level=error --conf spark.executor.cores=8 --conf spark.executor.memory=64g \ 288 | --conf spark.driver.memory=16g --conf spark.driver.extraClassPath=tpcds_pyspark/spark-measure_2.12-0.25.jar \ 289 | --conf spark.dynamicAllocation.enabled=false --conf spark.executor.instances=32 --conf spark.sql.shuffle.partitions=512 \ 290 | $TPCDS_PYSPARK -d hdfs:///tpcds_10000_parquet_1.13.1 291 | ``` 292 | 293 | - Example of running TPCDS on a Kubernetes cluster with S3 storage, monitored with the extended dashboard using Spark plugins: 294 | ``` 295 | TPCDS_PYSPARK=`which tpcds_pyspark_run.py` 296 | 297 | spark-submit --master k8s://https://xxx.xxx.xxx.xxx:6443 --conf spark.kubernetes.container.image=/spark:v3.5.1 --conf spark.kubernetes.namespace=xxx \ 298 | --conf spark.eventLog.enabled=false --conf spark.task.maxDirectResultSize=2000000000 --conf spark.shuffle.service.enabled=false --conf spark.executor.cores=8 --conf spark.executor.memory=32g --conf spark.driver.memory=4g \ 299 | --packages org.apache.hadoop:hadoop-aws:3.3.4,ch.cern.sparkmeasure:spark-measure_2.12:0.25,ch.cern.sparkmeasure:spark-plugins_2.12:0.4 --conf spark.plugins=ch.cern.HDFSMetrics,ch.cern.CgroupMetrics,ch.cern.CloudFSMetrics \ 300 | --conf spark.cernSparkPlugin.cloudFsName=s3a \ 301 | --conf spark.dynamicAllocation.enabled=false --conf spark.executor.instances=4 \ 302 | --conf spark.hadoop.fs.s3a.secret.key=$SECRET_KEY \ 303 | --conf spark.hadoop.fs.s3a.access.key=$ACCESS_KEY \ 304 | --conf spark.hadoop.fs.s3a.endpoint="https://s3.cern.ch" \ 305 | --conf spark.hadoop.fs.s3a.impl="org.apache.hadoop.fs.s3a.S3AFileSystem" \ 306 | --conf
spark.executor.metrics.fileSystemSchemes="file,hdfs,s3a" \ 307 | --conf spark.hadoop.fs.s3a.fast.upload=true \ 308 | --conf spark.hadoop.fs.s3a.path.style.access=true \ 309 | --conf spark.hadoop.fs.s3a.list.version=1 \ 310 | $TPCDS_PYSPARK -d s3a://luca/tpcds_100 311 | ``` 312 | 313 | --- 314 | ## Legacy implementation (spark-dashboard v1) 315 | 316 | Note: spark-dashboard v1 (the original implementation) uses InfluxDB as the time-series database; see also the 317 | [spark-dashboard v1 architecture](https://raw.githubusercontent.com/LucaCanali/Miscellaneous/master/Spark_Dashboard/images/Spark_metrics_dashboard_arch.PNG) 318 | 319 | ### How to run the Spark dashboard V1 on a container 320 | This is the original implementation of the tool, using InfluxDB and Grafana. 321 | 322 | **1. Start the container** 323 | The provided container image has been built and configured to run InfluxDB and Grafana 324 | - `docker run -p 3000:3000 -p 2003:2003 -d lucacanali/spark-dashboard:v01` 325 | - Note: port 2003 is for Graphite ingestion, port 3000 is for Grafana 326 | - More options, including how to persist InfluxDB data across restarts, at: [Spark dashboard in a container](dockerfiles_v1) 327 | 328 | **2. Spark configuration** 329 | See above 330 | 331 | **3. Visualize the metrics using a Grafana dashboard** 332 | - Point your browser to `http://hostname:3000` (edit `hostname` as relevant) 333 | - See details above 334 | 335 | --- 336 | ### How to run the dashboard V1 on Kubernetes using Helm 337 | If you choose to run on Kubernetes, these are the steps: 338 | 339 | 1. The Helm chart takes care of configuring and running InfluxDB and Grafana: 340 | - Quickstart: `helm install spark-dashboard https://github.com/cerndb/spark-dashboard/raw/master/charts_v1/spark-dashboard-0.3.0.tgz` 341 | - Details: [charts_v1](charts_v1) 342 | 343 | 2. Spark configuration: 344 | - Configure `metrics.properties` as detailed above. 345 | - Use `INFLUXDB_ENDPOINT=spark-dashboard-influx.default.svc.cluster.local` as the InfluxDB endpoint in 346 | the Spark configuration. 347 | 348 | 3. Grafana's visualization with Helm: 349 | - The Grafana dashboard is reachable at port 3000 of the spark-dashboard-grafana service. 350 | - See service details: `kubectl get service spark-dashboard-grafana` 351 | - When using NodePort and an internal cluster IP address, this is how you can port forward to the service from 352 | the local machine: `kubectl port-forward service/spark-dashboard-grafana 3000:3000` 353 | 354 | More info at [Spark dashboard on Kubernetes](charts_v1/README.md) 355 | 356 | --- 357 | ## Advanced configurations and notes 358 | 359 | ### Graph annotations: display query/job/stage start and end times 360 | Optionally, you can add annotation instrumentation to the performance dashboard v1. 361 | Annotations provide additional info on start and end times for queries, jobs and stages.
362 | To activate annotations, add the following additional configuration to spark-submit/spark-shell/pyspark, 363 | needed for collecting and writing extra performance data: 364 | ``` 365 | INFLUXDB_HTTP_ENDPOINT="http://`hostname`:8086" 366 | 367 | 368 | --packages ch.cern.sparkmeasure:spark-measure_2.12:0.25 \ 369 | --conf spark.sparkmeasure.influxdbURL=$INFLUXDB_HTTP_ENDPOINT \ 370 | --conf spark.extraListeners=ch.cern.sparkmeasure.InfluxDBSink \ 371 | ``` 372 | 373 | ### Notes 374 | - More details on how this works and alternative configurations at [Spark Dashboard](https://github.com/LucaCanali/Miscellaneous/tree/master/Spark_Dashboard) 375 | - The dashboard can be used when running Spark on a cluster (Kubernetes, YARN, Standalone) or in local mode. 376 | - When using Spark in local mode, use Spark version 3.1 or higher; see [SPARK-31711](https://issues.apache.org/jira/browse/SPARK-31711) 377 | 378 | ### Docker / Podman 379 | - For dashboard v2: Telegraf will use port 2003 (graphite endpoint), and VictoriaMetrics port 8428 (http endpoint), of your machine/VM. 380 | - For dashboard v1: InfluxDB will use port 2003 (graphite endpoint), and port 8086 (http endpoint) of 381 | your machine/VM (when running using `--network=host`). 382 | - Note: the endpoints need to be available on the node where you started the container and 383 | reachable by Spark executors and driver (mind the firewall). 384 | 385 | ### Helm 386 | - Find the InfluxDB endpoint IP with `kubectl get service spark-dashboard-influx`. 387 | - Optionally, resolve the DNS name of that IP with `nslookup`. 388 | For example, the InfluxDB service host name of a test installation is: `spark-dashboard-influx.default.svc.cluster.local` 389 | 390 | ### Customizing and adding new dashboards 391 | 392 | - This implementation comes with some example dashboards. Note that only a subset of the 393 | metrics values logged into VictoriaMetrics are visualized in the provided dashboard. 394 | - For a full list of the available metrics see the [documentation of Spark metrics system](https://github.com/apache/spark/blob/master/docs/monitoring.md#metrics). 395 | - New dashboards can be added by putting them in the relevant `grafana_dashboards` folder and re-building the container image 396 | (or re-packaging the helm chart), as sketched in the example below. 397 | - On Helm: running `helm upgrade` is enough to upload the new dashboard as a ConfigMap and make it available to Grafana. 398 | - Automatically persisting manual edits is not supported at this time.
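As a concrete illustration of the rebuild workflow mentioned above (a sketch; the dashboard file name and the image tag are illustrative), a dashboard exported from Grafana as JSON can be bundled into the v2 container image as follows:

```
# Sketch: bundle a custom Grafana dashboard into the v2 container image
# my_dashboard.json and the "custom" image tag are illustrative names
cd dockerfiles_v2
cp /path/to/my_dashboard.json grafana_dashboards/
docker build -t spark-dashboard:custom .
docker run -p 3000:3000 -p 2003:2003 -d spark-dashboard:custom
```
--------------------------------------------------------------------------------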