├── charts_v1 ├── spark-dashboard-0.3.0.tgz ├── Chart.yaml ├── templates │ ├── grafana_dashboards.yaml │ ├── influx_pv.yaml │ ├── grafana_service.yaml │ ├── influx_service.yaml │ ├── grafana_pod.yaml │ ├── _helpers.tpl │ ├── influx_graphiteconf.yaml │ ├── grafana_datasource.yaml │ └── influx_pod.yaml ├── values.yaml └── README.md ├── dockerfiles_v2 ├── grafana.ini ├── victoriametrics-metrics-datasource.yml ├── entrypoint.sh ├── spark.yaml ├── telegraf.conf ├── README.md └── Dockerfile ├── .devcontainer └── devcontainer.json ├── dockerfiles_v1 ├── spark.yaml ├── entrypoint.sh ├── influx.yaml ├── influxdb.conf ├── README.md └── Dockerfile ├── LICENSE └── README.md /charts_v1/spark-dashboard-0.3.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cerndb/spark-dashboard/HEAD/charts_v1/spark-dashboard-0.3.0.tgz -------------------------------------------------------------------------------- /charts_v1/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A performance dashboard for Apache Spark 4 | name: spark-dashboard 5 | version: 0.3.0 6 | -------------------------------------------------------------------------------- /charts_v1/templates/grafana_dashboards.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ .Release.Name }}-dashboard 5 | data: 6 | {{- (.Files.Glob "grafana_dashboards/*").AsConfig | nindent 5 }} 7 | -------------------------------------------------------------------------------- /dockerfiles_v2/grafana.ini: -------------------------------------------------------------------------------- 1 | [plugins] 2 | allow_loading_unsigned_plugins = victoriametrics-metrics-datasource 3 | [dashboards] 4 | default_home_dashboard_path = /var/lib/grafana/dashboards/Spark_Perf_Dashboard_v04_PromQL.json 5 | 6 | -------------------------------------------------------------------------------- /dockerfiles_v2/victoriametrics-metrics-datasource.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: VictoriaMetrics 5 | type: victoriametrics-metrics-datasource 6 | access: proxy 7 | url: http://localhost:8428 8 | isDefault: true 9 | 10 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "hostRequirements": { 3 | "cpus": 2, 4 | "memory": "8gb", 5 | "storage": "16gb" 6 | }, 7 | "postCreateCommand": "pip install pyspark sparkmeasure tpcds_pyspark", 8 | "extensions": ["ms-python.python"] 9 | } 10 | 11 | -------------------------------------------------------------------------------- /dockerfiles_v2/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Start the services 4 | service grafana-server start 5 | service telegraf start 6 | ./victoria-metrics-prod 7 | 8 | # when running with docker run -d option this keeps the container running 9 | tail -f /dev/null 10 | 11 | 12 | -------------------------------------------------------------------------------- /dockerfiles_v1/spark.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: spark-dashboard 5 | orgId: 1 6 | 
folder: '' 7 | folderUid: '' 8 | type: file 9 | disableDeletion: false 10 | editable: true 11 | updateIntervalSeconds: 10 12 | options: 13 | path: /var/lib/grafana/dashboards 14 | 15 | -------------------------------------------------------------------------------- /dockerfiles_v2/spark.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: spark-dashboard 5 | orgId: 1 6 | folder: '' 7 | folderUid: '' 8 | type: file 9 | disableDeletion: false 10 | editable: true 11 | updateIntervalSeconds: 10 12 | options: 13 | path: /var/lib/grafana/dashboards 14 | 15 | -------------------------------------------------------------------------------- /dockerfiles_v1/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This takes care of changing ownership, useful when mounting 4 | # /var/lib/influxdb from an external volume 5 | chown -R influxdb:influxdb /var/lib/influxdb 6 | 7 | service influxdb start 8 | service grafana-server start 9 | 10 | # when running with docker run -d option this keeps the container running 11 | tail -f /dev/null 12 | -------------------------------------------------------------------------------- /charts_v1/templates/influx_pv.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.influxdb.storage.class }} 2 | apiVersion: v1 3 | kind: PersistentVolumeClaim 4 | metadata: 5 | name: {{ .Release.Name }}-influx 6 | {{- with .Values.influxdb.storage }} 7 | spec: 8 | storageClassName: {{ .class }} 9 | {{- if .zone }} 10 | selector: 11 | matchLabels: 12 | failure-domain.beta.kubernetes.io/zone: {{ .zone }} 13 | {{- end }} 14 | accessModes: 15 | - {{ .type }} 16 | resources: 17 | requests: 18 | storage: {{ .size }} 19 | {{- end }} 20 | {{- end }} 21 | -------------------------------------------------------------------------------- /charts_v1/templates/grafana_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ .Release.Name }}-grafana 5 | {{- with .Values.grafana }} 6 | spec: 7 | selector: 8 | app: grafana 9 | type: {{ .service.type }} 10 | {{- if and .service.lbSourceRange (eq .service.type "LoadBalancer") }} 11 | loadBalancerSourceRanges: 12 | - {{ .service.lbSourceRange }} 13 | {{- end }} 14 | ports: 15 | - name: grafana 16 | protocol: TCP 17 | port: {{ .service.port }} 18 | targetPort: {{ .service.targetPort }} 19 | {{- end }} 20 | 21 | -------------------------------------------------------------------------------- /charts_v1/values.yaml: -------------------------------------------------------------------------------- 1 | grafana: 2 | image: "grafana/grafana:10.4.0" 3 | service: 4 | type: "NodePort" 5 | port: 3000 6 | targetPort: 3000 7 | influxdb: 8 | image: "influxdb:1.8.10" 9 | disableReporting: "true" 10 | dbName: "graphite" 11 | service: 12 | # type: "LoadBalancer" 13 | # lbSourceRange: "128.141.0.0/16" 14 | type: "NodePort" 15 | influx: 16 | port: 8086 17 | targetPort: 8086 18 | graphite: 19 | port: 2003 20 | targetPort: 2003 21 | storage: 22 | # class: "MyStorageClass" 23 | # type: "ReadWriteOnce" 24 | size: "1Gi" 25 | 26 | -------------------------------------------------------------------------------- /charts_v1/templates/influx_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 
4 | name: {{ .Release.Name }}-influx 5 | {{ with .Values.influxdb }} 6 | spec: 7 | selector: 8 | app: influx 9 | type: {{ .service.type }} 10 | {{- if and .service.lbSourceRange (eq .service.type "LoadBalancer") }} 11 | loadBalancerSourceRanges: 12 | - {{ .service.lbSourceRange }} 13 | {{- end }} 14 | ports: 15 | - protocol: TCP 16 | name: influx 17 | port: {{ .service.influx.port }} 18 | targetPort: {{ .service.influx.targetPort }} 19 | - protocol: TCP 20 | name: graphite 21 | port: {{ .service.graphite.port }} 22 | targetPort: {{ .service.graphite.targetPort }} 23 | {{- end }} 24 | -------------------------------------------------------------------------------- /dockerfiles_v1/influx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: influx-sparkmeasure 5 | type: influxdb 6 | access: proxy 7 | orgId: 1 8 | url: http://localhost:8086 9 | password: 10 | user: 11 | database: 12 | sparkmeasure 13 | basicAuth: 14 | basicAuthUser: 15 | basicAuthPassword: 16 | withCredentials: 17 | isDefault: 18 | version: 1 19 | editable: true 20 | - name: influx-graphite 21 | type: influxdb 22 | access: proxy 23 | orgId: 1 24 | url: http://localhost:8086 25 | password: 26 | user: 27 | database: 28 | graphite 29 | basicAuth: 30 | basicAuthUser: 31 | basicAuthPassword: 32 | withCredentials: 33 | isDefault: 34 | version: 1 35 | editable: true 36 | 37 | -------------------------------------------------------------------------------- /dockerfiles_v2/telegraf.conf: -------------------------------------------------------------------------------- 1 | [[inputs.socket_listener]] 2 | service_address = "tcp://:2003" 3 | data_format = "graphite" 4 | separator = "." 5 | templates = [ 6 | # JVM source 7 | "*.*.jvm.pools.* username.applicationid.executorid.namespace.namespace.measurement*", 8 | # YARN source 9 | "*.*.applicationMaster.* username.applicationid.namespace.measurement*", 10 | # shuffle service source 11 | "*.shuffleService.* username.namespace.measurement*", 12 | # streaming 13 | "*.*.*.spark.streaming.* username.applicationid.executorid.namespace.namespace.id.measurement*", 14 | # generic template for driver and executor sources 15 | "username.applicationid.executorid.namespace.measurement*" ] 16 | 17 | [[outputs.http]] 18 | ## URL is the address to send metrics to 19 | url = "http://localhost:8428/api/v1/write" 20 | method = "POST" 21 | data_format = "prometheusremotewrite" 22 | tagexclude = ["host", "namespace"] 23 | 24 | # Configure if needed 25 | #[agent] 26 | # interval = "10s" 27 | # flush_interval = "10s" 28 | # flush_jitter = "0s" 29 | 30 | -------------------------------------------------------------------------------- /dockerfiles_v1/influxdb.conf: -------------------------------------------------------------------------------- 1 | # influxdb conf 2 | 3 | [meta] 4 | dir = "/var/lib/influxdb/meta" 5 | 6 | [data] 7 | dir = "/var/lib/influxdb/data" 8 | engine = "tsm1" 9 | wal-dir = "/var/lib/influxdb/wal" 10 | 11 | # Note Grafana http endpoint is on port 8086 by default. 12 | 13 | [[graphite]] 14 | enabled = true 15 | bind-address = ":2003" 16 | database = "graphite" 17 | retention-policy = "" 18 | protocol = "tcp" 19 | batch-size = 5000 20 | batch-pending = 10 21 | batch-timeout = "1s" 22 | consistency-level = "one" 23 | separator = "." 
24 | udp-read-buffer = 0 25 | templates = [ 26 | # JVM source 27 | "*.*.jvm.pools.* username.applicationid.process.namespace.namespace.measurement*", 28 | # YARN source 29 | "*.*.applicationMaster.* username.applicationid.namespace.measurement*", 30 | # shuffle service source 31 | "*.shuffleService.* username.namespace.measurement*", 32 | # streaming 33 | "*.*.*.spark.streaming.* username.applicationid.process.namespace.namespace.id.measurement*", 34 | # generic template for driver and executor sources 35 | "username.applicationid.process.namespace.measurement*" ] 36 | -------------------------------------------------------------------------------- /charts_v1/templates/grafana_pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ .Release.Name }}-grafana 5 | labels: 6 | app: grafana 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: grafana 12 | template: 13 | metadata: 14 | labels: 15 | app: grafana 16 | spec: 17 | containers: 18 | - name: grafana 19 | image: {{ .Values.grafana.image }} 20 | volumeMounts: 21 | - name: datasource-conf 22 | mountPath: /etc/grafana/provisioning/datasources/influx.yaml 23 | subPath: influx-datasource-config 24 | - name: datasource-conf 25 | mountPath: /etc/grafana/provisioning/dashboards/spark.yaml 26 | subPath: spark-dashboard-config 27 | - name: spark-dashboard 28 | mountPath: /var/lib/grafana/dashboards 29 | ports: 30 | - containerPort: {{ .Values.grafana.service.port }} 31 | volumes: 32 | - name: datasource-conf 33 | configMap: 34 | name: {{ .Release.Name }}-datasource 35 | - name: spark-dashboard 36 | configMap: 37 | name: {{ .Release.Name }}-dashboard 38 | -------------------------------------------------------------------------------- /charts_v1/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "spark_dashboard.name" -}} 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create a default fully qualified app name. 11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 12 | If release name contains chart name it will be used as a full name. 13 | */}} 14 | {{- define "spark_dashboard.fullname" -}} 15 | {{- if .Values.fullnameOverride -}} 16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 17 | {{- else -}} 18 | {{- $name := default .Chart.Name .Values.nameOverride -}} 19 | {{- if contains $name .Release.Name -}} 20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 21 | {{- else -}} 22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 23 | {{- end -}} 24 | {{- end -}} 25 | {{- end -}} 26 | 27 | {{/* 28 | Create chart name and version as used by the chart label. 
29 | */}} 30 | {{- define "spark_dashboard.chart" -}} 31 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 32 | {{- end -}} 33 | -------------------------------------------------------------------------------- /dockerfiles_v1/README.md: -------------------------------------------------------------------------------- 1 | # How to build and run the legacy (v01) Spark dashboard in a container image 2 | 3 | ## How to run 4 | Run the dashboard v01 using a container image from [Dockerhub](https://hub.docker.com/r/lucacanali/spark-dashboard): 5 | - There are a few ports needed and multiple options on how to expose them 6 | - Port 2003 is for Graphite ingestion, port 3000 is for Grafana, port 8086 is used internally by the Grafana data source 7 | - You can expose the ports from the container individually or just use `--network=host`. 8 | - Examples: 9 | ``` 10 | docker run --network=host -d lucacanali/spark-dashboard:v01 11 | or 12 | docker run -p 3000:3000 -p 2003:2003 -d lucacanali/spark-dashboard:v01 13 | or 14 | docker run -p 3000:3000 -p 2003:2003 -p 8086:8086 -d lucacanali/spark-dashboard:v01 15 | ``` 16 | 17 | ## Advanced: persist InfluxDB data across restarts 18 | - This shows an example of how to use a volume to store InfluxDB data. 19 | It allows preserving the history across runs when the container is restarted; 20 | otherwise InfluxDB starts from scratch each time. 21 | ``` 22 | docker run --network=host -v MYPATH/myinfluxdir:/var/lib/influxdb -d lucacanali/spark-dashboard:v01 23 | ``` 24 | 25 | ## How to build the image: 26 | ``` 27 | docker build -t spark-dashboard:v01 . 28 | ``` 29 | 30 | -------------------------------------------------------------------------------- /dockerfiles_v1/Dockerfile: -------------------------------------------------------------------------------- 1 | # Container image for Spark Dashboard 2 | # using InfluxDB and Grafana 3 | 4 | FROM ubuntu:22.04 5 | 6 | ENV INFLUXDB_VERSION 1.8.10 7 | ENV GRAFANA_VERSION 10.4.0 8 | ENV ARCH amd64 9 | 10 | RUN set -ex && \ 11 | apt-get update && \ 12 | apt-get install -qq -y curl libfontconfig musl && \ 13 | curl -O https://dl.grafana.com/oss/release/grafana_${GRAFANA_VERSION}_${ARCH}.deb && \ 14 | dpkg -i grafana_${GRAFANA_VERSION}_${ARCH}.deb && \ 15 | rm -f grafana_${GRAFANA_VERSION}_${ARCH}.deb && \ 16 | curl -O https://dl.influxdata.com/influxdb/releases/influxdb_${INFLUXDB_VERSION}_${ARCH}.deb && \ 17 | dpkg -i influxdb_${INFLUXDB_VERSION}_${ARCH}.deb && \ 18 | rm -f influxdb_${INFLUXDB_VERSION}_${ARCH}.deb 19 | 20 | COPY influxdb.conf /etc/influxdb/influxdb.conf 21 | COPY --chown=grafana:grafana grafana_dashboards /var/lib/grafana/dashboards 22 | COPY --chown=grafana:grafana influx.yaml /etc/grafana/provisioning/datasources/influx.yaml 23 | COPY --chown=grafana:grafana spark.yaml /etc/grafana/provisioning/dashboards/spark.yaml 24 | COPY entrypoint.sh /opt/entrypoint.sh 25 | 26 | # expose grafana dashboard (3000) and influxdb graphite (2003) and http (8086) endpoints 27 | EXPOSE 3000/tcp 2003/tcp 8086/tcp 28 | 29 | WORKDIR / 30 | ENTRYPOINT [ "/opt/entrypoint.sh" ] 31 | -------------------------------------------------------------------------------- /charts_v1/templates/influx_graphiteconf.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ .Release.Name }}-config 5 | data: 6 | influx-graphite-config: | 7 | [meta] 8 | dir = "/var/lib/influxdb/meta" 9 | 10 | [data] 11 | dir =
"/var/lib/influxdb/data" 12 | engine = "tsm1" 13 | wal-dir = "/var/lib/influxdb/wal" 14 | 15 | [[graphite]] 16 | enabled = true 17 | bind-address = ":2003" 18 | database = "graphite" 19 | retention-policy = "" 20 | protocol = "tcp" 21 | batch-size = 5000 22 | batch-pending = 10 23 | batch-timeout = "1s" 24 | consistency-level = "one" 25 | separator = "." 26 | udp-read-buffer = 0 27 | templates = [ 28 | # JVM source 29 | "*.*.jvm.pools.* username.applicationid.process.namespace.namespace.measurement*", 30 | # YARN source 31 | "*.*.applicationMaster.* username.applicationid.namespace.measurement*", 32 | # shuffle service source 33 | "*.shuffleService.* username.namespace.measurement*", 34 | # streaming 35 | "*.*.*.spark.streaming.* username.applicationid.process.namespace.namespace.id.measurement*", 36 | # generic template for driver and executor sources 37 | "username.applicationid.process.namespace.measurement*" ] 38 | -------------------------------------------------------------------------------- /charts_v1/templates/grafana_datasource.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ .Release.Name }}-datasource 5 | data: 6 | influx-datasource-config: | 7 | apiVersion: 1 8 | 9 | datasources: 10 | - name: influx-sparkmeasure 11 | type: influxdb 12 | access: proxy 13 | orgId: 1 14 | url: http://{{ .Release.Name }}-influx:8086 15 | password: 16 | user: 17 | database: 18 | sparkmeasure 19 | basicAuth: 20 | basicAuthUser: 21 | basicAuthPassword: 22 | withCredentials: 23 | isDefault: 24 | version: 1 25 | editable: true 26 | - name: influx-graphite 27 | type: influxdb 28 | access: proxy 29 | orgId: 1 30 | url: http://{{ .Release.Name }}-influx:8086 31 | password: 32 | user: 33 | database: 34 | graphite 35 | basicAuth: 36 | basicAuthUser: 37 | basicAuthPassword: 38 | withCredentials: 39 | isDefault: 40 | version: 1 41 | editable: true 42 | spark-dashboard-config: | 43 | apiVersion: 1 44 | 45 | providers: 46 | - name: spark-dashboard 47 | orgId: 1 48 | folder: '' 49 | folderUid: '' 50 | type: file 51 | disableDeletion: false 52 | editable: true 53 | updateIntervalSeconds: 10 54 | options: 55 | path: /var/lib/grafana/dashboards 56 | -------------------------------------------------------------------------------- /charts_v1/templates/influx_pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ .Release.Name }}-influx 5 | labels: 6 | app: influx 7 | {{- with .Values.influxdb }} 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app: influx 13 | template: 14 | metadata: 15 | labels: 16 | app: influx 17 | spec: 18 | containers: 19 | - name: influx 20 | image: {{ .image }} 21 | ports: 22 | - containerPort: {{ .service.graphite.port }} 23 | - containerPort: {{ .service.influx.port }} 24 | volumeMounts: 25 | - name: influx-data 26 | mountPath: /var/lib/influxdb 27 | - name: graphite-config 28 | mountPath: /etc/influxdb/influxdb.conf 29 | subPath: influx-graphite-config 30 | env: 31 | - name: INFLUXDB_REPORTING_DISABLED 32 | value: {{ .disableReporting | quote}} 33 | - name: INFLUXDB_DB 34 | value: {{ .dbName | quote }} 35 | {{- end }} 36 | volumes: 37 | - name: graphite-config 38 | configMap: 39 | name: {{ .Release.Name }}-config 40 | - name: influx-data 41 | {{- if .Values.influxdb.storage.class }} 42 | persistentVolumeClaim: 43 | claimName: {{ .Release.Name }}-influx 44 | {{- else 
}} 45 | emptyDir: {} 46 | {{- end }} 47 | 48 | -------------------------------------------------------------------------------- /dockerfiles_v2/README.md: -------------------------------------------------------------------------------- 1 | # How to build and run the Spark dashboard in a container image 2 | 3 | ## How to run 4 | Run the dashboard using a container image from [Dockerhub](https://hub.docker.com/r/lucacanali/spark-dashboard): 5 | - There are a few ports needed and multiple options on how to expose them 6 | - Port 2003 is for Graphite ingestion, port 3000 is for Grafana, port 8428 is used internally by the VictoriaMetrics data source 7 | - You can expose the ports from the container individually or just use `--network=host`. 8 | - Examples: 9 | ``` 10 | docker run --network=host -d lucacanali/spark-dashboard 11 | or 12 | docker run -p 3000:3000 -p 2003:2003 -d lucacanali/spark-dashboard 13 | or 14 | docker run -p 3000:3000 -p 2003:2003 -p 8428:8428 -d lucacanali/spark-dashboard 15 | ``` 16 | 17 | ## Persisting VictoriaMetrics Data Across Restarts 18 | By default, VictoriaMetrics does not retain data between container restarts—each time the container starts, it begins with an empty dataset. 19 | To preserve historical metrics, you need to mount a persistent volume for data storage. 20 | 21 | Below is an example of how to do this using a local directory: 22 | 23 | ``` 24 | # Create a directory to store VictoriaMetrics data 25 | mkdir metrics_data 26 | 27 | # Run the container with the local directory mounted as the data volume. 28 | # This ensures your metrics history survives container restarts. 29 | docker run --network=host \ 30 | -v ./metrics_data:/victoria-metrics-data \ 31 | -d lucacanali/spark-dashboard:v02 32 | ``` 33 | 34 | ## Example of how to build the image: 35 | ``` 36 | cd dockerfiles_v2 37 | docker build -t spark-dashboard:v02 . 38 | ``` 39 | 40 | -------------------------------------------------------------------------------- /charts_v1/README.md: -------------------------------------------------------------------------------- 1 | # How to install the Helm Chart for the Spark Dashboard 2 | 3 | The Helm chart is installed using [helm](https://helm.sh/docs/intro/quickstart/): 4 | ``` 5 | helm install spark-dashboard https://github.com/cerndb/spark-dashboard/raw/master/charts_v1/spark-dashboard-0.3.0.tgz 6 | ``` 7 | 8 | Other installation options: 9 | 10 | ``` 11 | # Install from source. 12 | # Prerequisite: download the repo and cd into the charts_v1 directory 13 | helm install spark-dashboard -f values.yaml . 14 | ``` 15 | 16 | ``` 17 | # Re-package and install 18 | helm package . 19 | helm install spark-dashboard spark-dashboard-0.3.0.tgz 20 | ``` 21 | 22 | Additional admin commands: 23 | ``` 24 | # Update the chart (after repackaging) 25 | helm upgrade --install spark-dashboard spark-dashboard-0.3.0.tgz 26 | 27 | # uninstall 28 | helm uninstall spark-dashboard 29 | 30 | # list and display installed components 31 | helm list 32 | kubectl get service spark-dashboard-grafana spark-dashboard-influx 33 | kubectl get pods |grep spark-dashboard 34 | kubectl get configmaps |grep spark-dashboard 35 | ``` 36 | 37 | ## Configuration options 38 | 39 | The provided configuration is for testing purposes; for production use you may need further configuration, as typical for these types of components.
40 | - The storage for InfluxDB can be defined in the `values.yaml` 41 | - If no storageClass is provided, an `emptyDir` volume will be allocated: the dashboard history will be lost when the 42 | underlying pod is restarted. You may want to use a persistent backend in the configuration instead. 43 | - The services exposed by `grafana` and `influx` in the example are of type `NodePort`. You can use `LoadBalancer` type if your Kubernetes distribution supports it. 44 | -------------------------------------------------------------------------------- /dockerfiles_v2/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | 3 | ENV TELEGRAF_VERSION 1.37.0-1 4 | ENV GRAFANA_VERSION 12.3.0 5 | ENV VM_VERSION v1.131.0 6 | ENV ARCH amd64 7 | ENV GRAFANA_VM_PLUGIN_VERSION v0.19.7 8 | ENV PLUGIN_PATH /var/lib/grafana/plugins 9 | 10 | # Download and install Grafana 11 | RUN set -ex && \ 12 | apt-get update && \ 13 | apt-get install -qq -y curl libfontconfig musl adduser && \ 14 | curl -O https://dl.grafana.com/oss/release/grafana_${GRAFANA_VERSION}_${ARCH}.deb && \ 15 | dpkg -i grafana_${GRAFANA_VERSION}_${ARCH}.deb && \ 16 | rm -f grafana_${GRAFANA_VERSION}_${ARCH}.deb 17 | 18 | # Copy the bundled dashboards for the spark-dashboard 19 | COPY grafana_dashboards /var/lib/grafana/dashboards 20 | COPY spark.yaml /etc/grafana/provisioning/dashboards/spark.yaml 21 | 22 | # Install and configure Grafana datasource for VictoriaMetrics 23 | RUN set -ex && \ 24 | curl -L -O https://github.com/VictoriaMetrics/victoriametrics-datasource/releases/download/${GRAFANA_VM_PLUGIN_VERSION}/victoriametrics-metrics-datasource-${GRAFANA_VM_PLUGIN_VERSION}.tar.gz && \ 25 | tar -xzf victoriametrics-metrics-datasource-${GRAFANA_VM_PLUGIN_VERSION}.tar.gz && \ 26 | find victoriametrics-metrics-datasource -type f -name "victoriametrics_backend_plugin*" !
-name "*linux_amd64" -exec rm -f {} + && \ 27 | mkdir ${PLUGIN_PATH} && \ 28 | mv victoriametrics-metrics-datasource ${PLUGIN_PATH} && \ 29 | rm victoriametrics-metrics-datasource-${GRAFANA_VM_PLUGIN_VERSION}.tar.gz 30 | 31 | COPY grafana.ini /etc/grafana/grafana.ini 32 | COPY victoriametrics-metrics-datasource.yml /etc/grafana/provisioning/datasources/victoriametrics-metrics-datasource.yml 33 | 34 | # Install and configure Telegraf 35 | RUN set -ex && \ 36 | curl -O https://repos.influxdata.com/debian/packages/telegraf_${TELEGRAF_VERSION}_${ARCH}.deb && \ 37 | dpkg -i telegraf_${TELEGRAF_VERSION}_${ARCH}.deb && \ 38 | rm -f telegraf_${TELEGRAF_VERSION}_${ARCH}.deb 39 | 40 | COPY telegraf.conf /etc/telegraf/telegraf.conf 41 | 42 | # Download and install VictoriaMetrics (VM) 43 | RUN set -ex && \ 44 | curl -L -O https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/victoria-metrics-linux-${ARCH}-${VM_VERSION}.tar.gz && \ 45 | tar -xzvf victoria-metrics-*.tar.gz && \ 46 | rm -f victoria-metrics-linux-${ARCH}-${VM_VERSION}.tar.gz 47 | 48 | # Copy the entrypoint script, it contains the startup commands 49 | COPY entrypoint.sh /opt/entrypoint.sh 50 | 51 | # Expose the ports for Grafana, Telegraf and VictoriaMetrics 52 | EXPOSE 3000/tcp 2003/tcp 8428/tcp 53 | 54 | WORKDIR / 55 | ENTRYPOINT [ "/opt/entrypoint.sh" ] 56 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark-Dashboard 2 | Real-Time Spark Monitoring & Performance Troubleshooting 3 | 4 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14718682.svg)](https://doi.org/10.5281/zenodo.14718682) 5 | [![Docker Pulls](https://img.shields.io/docker/pulls/lucacanali/spark-dashboard)](https://hub.docker.com/r/lucacanali/spark-dashboard) 6 | 7 | **Spark-Dashboard** offers a simple, intuitive interface for real-time monitoring of Apache Spark clusters. 8 | It visualizes key metrics, CPU, memory, task throughput, I/O, and more, as time series, making it easy to track trends, spot issues, 9 | and analyze workload evolution. 10 | Ideal for engineers and data teams, Spark-Dashboard streamlines Spark troubleshooting and root cause analysis. 11 | 12 | ## Key Features 13 | 14 | - **Real-time Performance Monitoring:** 15 | Visualize the evolution of Spark and system metrics, including CPU, memory, active tasks, and I/O, over time. Instantly spot trends and anomalies. 16 | 17 | - **Real-Time Visualization:** 18 | Integrated with Grafana for dynamic, interactive visualizations, enabling fast and effective performance analysis. 19 | 20 | - **Broad Compatibility:** 21 | Works with all major Apache Spark versions (4.x, 3.x) and across diverse cluster environments: Hadoop, Kubernetes, and Spark Standalone. 
22 | 23 | ### Contents 24 | - [Architecture](#architecture) 25 | - [How To Deploy the Spark Dashboard V2](#how-to-deploy-the-spark-dashboard) 26 | - [How to run the Spark Dashboard V2 on a container](#how-to-run-the-spark-dashboard-v2-on-a-container) 27 | - [Persisting metric storage across container restarts](https://github.com/cerndb/spark-dashboard#persisting-victoriametrics-data-across-restarts) 28 | - [Extended Spark dashboard](#extended-spark-dashboard) 29 | - [Notes on Running Spark Dashboard on Spark Connect](#notes-on-running-spark-dashboard-on-spark-connect) 30 | - [Examples and getting started with Spark Performance dashboards](#examples-and-getting-started-with-spark-performance-dashboards) 31 | - [Start small, testing with Spark in local mode](#start-small-testing-with-spark-in-local-mode) 32 | - [Measuring with Spark Dashboard while running TPCDS on a Spark cluster](#running-tpcds-on-a-spark-cluster) 33 | - [Old implementation (V1)](#old-implementation-v1) 34 | - [How to run the Spark dashboard V1 on a container](#how-to-run-the-spark-dashboard-v1-on-a-container) 35 | - [How to run the dashboard V1 on Kubernetes using Helm](#how-to-run-the-dashboard-v1-on-kubernetes-using-helm) 36 | - [Advanced configurations and notes](#advanced-configurations-and-notes) 37 | 38 | ### Resources 39 | - [![Watch the video](https://www.youtube.com/s/desktop/050e6796/img/favicon_32x32.png) Watch Spark-Dashboard demo and tutorial](https://www.youtube.com/watch?v=sLjAyDwpg80) 40 | - Notes on [Spark Dashboard](https://github.com/LucaCanali/Miscellaneous/tree/master/Spark_Dashboard) 41 | - Blog on [building an Apache Spark Performance Lab](https://db-blog.web.cern.ch/node/195) 42 | - Blog [on Spark Dashboard](https://db-blog.web.cern.ch/blog/luca-canali/2019-02-performance-dashboard-apache-spark) 43 | - Talk on Spark performance at [Data+AI Summit 2021](https://databricks.com/session_na21/monitor-apache-spark-3-on-kubernetes-using-metrics-and-plugins), [slides](http://canali.web.cern.ch/docs/Monitor_Spark3_on_Kubernetes_DataAI2021_LucaCanali.pdf) 44 | - [sparkMeasure](https://github.com/LucaCanali/sparkMeasure) a tool for performance troubleshooting of Apache Spark workloads 45 | - [TPCDS_PySpark](https://github.com/LucaCanali/Miscellaneous/tree/master/Performance_Testing/TPCDS_PySpark) a TPC-DS workload generator written in Python and designed to run at scale using Apache Spark 46 | 47 | Main author and contact: Luca.Canali@cern.ch 48 | 49 | --- 50 | ## Architecture 51 | 52 | ![Spark metrics dashboard architecture](https://raw.githubusercontent.com/LucaCanali/Miscellaneous/master/Spark_Dashboard/images/Spark_MetricsSystem_Grafana_Dashboard_V2.0.png "Spark metrics dashboard architecture") 53 | 54 | This diagram illustrates an end-to-end monitoring pipeline for Apache Spark built entirely on open-source components. 55 | The architecture is designed to deliver real-time insights into the performance and health of your Spark clusters through 56 | a seamless flow of data from metric generation to visualization. 57 | 58 | - **Apache Spark Metrics:** 59 | Apache Spark generates detailed performance metrics via its [metrics system](https://spark.apache.org/docs/latest/monitoring.html#metrics). 60 | Both the driver and executors emit a wide range of metrics—such as runtime, CPU usage, garbage collection (GC) time, memory utilization, shuffle statistics, and I/O metrics—in Graphite format. 61 | 62 | - **Telegraf:** 63 | Acting as the collection agent, Telegraf ingests the metrics emitted by Spark. 
It enriches these measurements with additional 64 | labels and tags to facilitate effective organization and analysis before forwarding them to the storage backend. 65 | 66 | - **VictoriaMetrics:** 67 | This robust time-series database efficiently stores the labeled metrics data. Its design is optimized for handling large volumes 68 | of timestamped, sequential data, making it ideal for monitoring and historical trend analysis. 69 | 70 | - **Grafana:** 71 | Grafana provides a dynamic visualization layer, querying VictoriaMetrics using PromQL/MetricsQL. The result is a set of interactive 72 | dashboards that display real-time metrics and trends, empowering users to monitor system performance and swiftly identify any bottlenecks. 73 | 74 | Together, these components form a cohesive and scalable monitoring solution tailored for Apache Spark environments. 75 | 76 | --- 77 | ## How To Deploy the Spark Dashboard 78 | 79 | This quickstart guide presents multiple methods for deploying Spark Dashboard. The **recommended** approach is to deploy 80 | Spark-Dashboard v2 using a container. 81 | 82 | ### How to run the Spark Dashboard V2 on a container 83 | Follow these steps to get started with the container image: 84 | 85 | #### 1. Start the container 86 | The provided container image is pre-configured to run VictoriaMetrics (for metrics storage) and Grafana (for visualization). 87 | You can start the container using either Docker or Podman: 88 | 89 | - **Using Docker:** 90 | 91 | `docker run -p 3000:3000 -p 2003:2003 -d lucacanali/spark-dashboard` 92 | 93 | - **Using Podman:** 94 | 95 | `podman run -p 3000:3000 -p 2003:2003 -d lucacanali/spark-dashboard` 96 | 97 | #### 2. Configure Apache Spark 98 | 99 | To send metrics from Spark to the dashboard, configure Spark to export its metrics to the Graphite endpoint provided by the container. 100 | 101 | **Method A: Using `metrics.properties`** 102 | 103 | Edit the `metrics.properties` file located in `$SPARK_CONF_DIR` and add the following configuration: 104 | 105 | # Configure Graphite sink for Spark metrics 106 | *.sink.graphite.host=localhost 107 | *.sink.graphite.port=2003 108 | *.sink.graphite.period=10 109 | *.sink.graphite.unit=seconds 110 | *.sink.graphite.prefix=lucatest 111 | 112 | # Enable JVM metrics collection 113 | *.source.jvm.class=org.apache.spark.metrics.source.JvmSource 114 | 115 | Optionally, add these settings to your Spark launch configuration (or `spark-defaults.conf`): 116 | 117 | --conf spark.metrics.staticSources.enabled=true 118 | --conf spark.metrics.appStatusSource.enabled=true 119 | 120 | **Method B: Passing Configuration via Command-Line** 121 | 122 | Alternatively, you can specify Spark metrics settings directly when launching your Spark application. 
For example: 123 | 124 | # Define the VictoriaMetrics Graphite endpoint (replace `hostname` with your actual host if needed) 125 | VICTORIAMETRICS_ENDPOINT=$(hostname) 126 | 127 | bin/spark-shell \ 128 | --conf "spark.metrics.conf.*.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink" \ 129 | --conf "spark.metrics.conf.*.sink.graphite.host=${VICTORIAMETRICS_ENDPOINT}" \ 130 | --conf "spark.metrics.conf.*.sink.graphite.port=2003" \ 131 | --conf "spark.metrics.conf.*.sink.graphite.period=10" \ 132 | --conf "spark.metrics.conf.*.sink.graphite.unit=seconds" \ 133 | --conf "spark.metrics.conf.*.sink.graphite.prefix=lucatest" \ 134 | --conf "spark.metrics.conf.*.source.jvm.class=org.apache.spark.metrics.source.JvmSource" \ 135 | --conf "spark.metrics.staticSources.enabled=true" \ 136 | --conf "spark.metrics.appStatusSource.enabled=true" 137 | 138 | *Optional:* To also collect and display "Tree Process Memory Details", add: 139 | 140 | --conf spark.executor.processTreeMetrics.enabled=true 141 | 142 | #### 3. Visualize Metrics in Grafana 143 | 144 | Once the container is running and Spark is configured to export metrics, you can view the performance data through Grafana: 145 | 146 | - **Access Grafana:** 147 | Open your web browser and navigate to [http://localhost:3000](http://localhost:3000) (replace `localhost` with your server's address if necessary). 148 | 149 | - **Login Credentials:** 150 | Use the default credentials: 151 | **User:** `admin` 152 | **Password:** `admin` 153 | 154 | - **Dashboard Overview:** 155 | The bundled dashboard (**Spark_Perf_Dashboard_v04_promQL**) displays a summary of key metrics (such as runtime, CPU usage, I/O, shuffle, task counts, etc.) along with detailed timeseries graphs. Select the appropriate username, application ID, and time range (default is the last 5 minutes) to customize your view. 156 | 157 | > **Important:** 158 | > Ensure that you have a running Spark application configured as detailed above so that metrics are available for selection and display. 159 | 160 | For testing purposes, you can generate load on Spark using [TPCDS_PySpark](https://github.com/LucaCanali/Miscellaneous/tree/master/Performance_Testing/TPCDS_PySpark), a TPC-DS workload generator written in Python and designed to run at scale with Apache Spark. 161 | 162 | ---- 163 | ### Persisting VictoriaMetrics Data Across Restarts 164 | 165 | By default, VictoriaMetrics does not retain data between container restarts—each time the container starts, it begins with an empty dataset. 166 | To preserve historical metrics, you need to mount a persistent volume for data storage. 167 | 168 | Below is an example of how to do this using a local directory: 169 | 170 | ``` 171 | # Create a directory to store VictoriaMetrics data 172 | mkdir metrics_data 173 | 174 | # Run the container with the local directory mounted as the data volume. 175 | # This ensures your metrics history survives container restarts. 176 | docker run --network=host \ 177 | -v ./metrics_data:/victoria-metrics-data \ 178 | -d lucacanali/spark-dashboard:v02 179 | ``` 180 | 181 | --- 182 | ### Extended Spark Dashboard 183 | 184 | Enhance your monitoring capabilities with the Extended Spark Dashboard, which collects and visualizes OS and storage metrics alongside standard Spark performance data. This enhanced pipeline leverages [Spark Plugins](https://github.com/cerndb/SparkPlugins) to gather additional metrics, all stored within the same VictoriaMetrics database as the standard Spark metrics. 
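For a quick end-to-end illustration, here is a sketch that combines the command-line sink settings shown earlier (Method B) with the plugin configuration detailed in the Configuration subsection below. It assumes the dashboard container from the previous sections is running on `localhost` and that the plugin version shown is still current:

```
# Sketch: send extended metrics from a local spark-shell to a running spark-dashboard container
# (plugin coordinates and sink settings are taken from the other sections of this README)
bin/spark-shell \
  --packages ch.cern.sparkmeasure:spark-plugins_2.12:0.4 \
  --conf "spark.plugins"="ch.cern.HDFSMetrics,ch.cern.CgroupMetrics,ch.cern.CloudFSMetrics" \
  --conf "spark.metrics.conf.*.sink.graphite.class"="org.apache.spark.metrics.sink.GraphiteSink" \
  --conf "spark.metrics.conf.*.sink.graphite.host"="localhost" \
  --conf "spark.metrics.conf.*.sink.graphite.port"=2003 \
  --conf "spark.metrics.conf.*.sink.graphite.period"=10 \
  --conf "spark.metrics.conf.*.sink.graphite.unit"=seconds \
  --conf "spark.metrics.conf.*.sink.graphite.prefix"="lucatest" \
  --conf "spark.metrics.staticSources.enabled"=true \
  --conf "spark.metrics.appStatusSource.enabled"=true
```

Metrics collected by the plugins then appear in the extended dashboard described below.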
185 | 186 | #### Additional Dashboard Features 187 | 188 | The extended dashboard introduces three extra groups of graphs beyond those available in the standard Spark Dashboard: 189 | 190 | - **CGroup Metrics** 191 | Collects data via CGroup instrumentation—ideal for Spark running on Kubernetes. 192 | 193 | - **Cloud Storage** 194 | Displays metrics from block storage systems such as S3A, GZ, WASB, and other cloud storage services. 195 | 196 | - **HDFS Advanced Statistics** 197 | Provides deeper insights into HDFS usage, offering additional performance metrics when Spark leverages HDFS. 198 | 199 | #### Configuration 200 | 201 | To enable extended metrics, add the following configurations to your Spark setup: 202 | 203 | --packages ch.cern.sparkmeasure:spark-plugins_2.12:0.4 204 | --conf spark.plugins=ch.cern.HDFSMetrics,ch.cern.CgroupMetrics,ch.cern.CloudFSMetrics 205 | 206 | #### Using the Extended Dashboard 207 | 208 | After configuring Spark, select the extended dashboard in Grafana to view the additional metrics: 209 | 210 | - **Dashboard Name:** `Spark_Perf_Dashboard_v04_PromQL_with_SparkPlugins` 211 | - The dashboard includes extra graphs for OS and storage metrics, offering a comprehensive view of your system's performance. 212 | 213 | ---- 214 | ### Notes on Running Spark Dashboard on Spark Connect 215 | 216 | [Spark Connect](https://spark.apache.org/docs/latest/spark-connect-overview.html) allows you to run a lightweight Spark client that connects remotely to your Spark cluster. 217 | When using Spark Connect, **Spark-Dashboard must be started on the Spark Connect server**, not on the client. Follow these steps: 218 | 219 | 1. **Start the Spark-Dashboard container** 220 | (see instructions above). 221 | 2. **Edit the `metrics.properties` file** 222 | in the Spark Connect `conf` directory as described above. 223 | 3. **Start Spark Connect** 224 | ``` 225 | sbin/start-connect-server.sh 226 | ``` 227 | Metrics from Spark Connect will now be sent to the Spark-Dashboard container and visualized in Grafana. 228 | 229 | ----- 230 | ## Examples and getting started with Spark Performance dashboards: 231 | - See some [examples of the dashboard graphs at this link](https://github.com/LucaCanali/Miscellaneous/tree/master/Spark_Dashboard#example-graphs) 232 | 233 | ### Start small, testing with Spark in local mode 234 | - You can use the [TPCDS_PySpark](https://github.com/LucaCanali/Miscellaneous/tree/master/Performance_Testing/TPCDS_PySpark) package to generate a TPC-DS workload and test the dashboard. 235 | - Run the following on local resources or cloud, for example use GitHub Codespaces from this repo 236 | - [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/cerndb/spark-dashboard) 237 | ``` 238 | # Install the tool and dependencies 239 | pip install pyspark 240 | pip install sparkmeasure 241 | pip install tpcds_pyspark 242 | 243 | # Download the test data 244 | wget https://sparkdltrigger.web.cern.ch/sparkdltrigger/TPCDS/tpcds_10.zip 245 | unzip -q tpcds_10.zip 246 | 247 | # 1. Run the tool for a minimal test 248 | tpcds_pyspark_run.py -d tpcds_10 -n 1 -r 1 --queries q1,q2 249 | 250 | # 2. Start the dashboard and visualize the metrics (use docker or podman) 251 | docker run -p 2003:2003 -p 3000:3000 -d lucacanali/spark-dashboard 252 | 253 | # 3.
Run the tpcds workload sending metrics to the dashboard 254 | TPCDS_PYSPARK=`which tpcds_pyspark_run.py` 255 | spark-submit --master local[*] \ 256 | --conf "spark.metrics.conf.*.sink.graphite.class"="org.apache.spark.metrics.sink.GraphiteSink" \ 257 | --conf "spark.metrics.conf.*.sink.graphite.host"="localhost" \ 258 | --conf "spark.metrics.conf.*.sink.graphite.port"=2003 \ 259 | --conf "spark.metrics.conf.*.sink.graphite.period"=10 \ 260 | --conf "spark.metrics.conf.*.sink.graphite.unit"=seconds \ 261 | --conf "spark.metrics.conf.*.sink.graphite.prefix"="lucatest" \ 262 | --conf "spark.metrics.conf.*.source.jvm.class"="org.apache.spark.metrics.source.JvmSource" \ 263 | --conf "spark.metrics.staticSources.enabled"=true \ 264 | --conf "spark.metrics.appStatusSource.enabled"=true \ 265 | --conf spark.driver.memory=4g \ 266 | --conf spark.log.level=error \ 267 | --packages ch.cern.sparkmeasure:spark-measure_2.12:0.25 \ 268 | $TPCDS_PYSPARK -d tpcds_10 269 | 270 | # 4. Accessing the Grafana Dashboard: 271 | # - Navigate to http://localhost:3000 to access the Grafana dashboard. 272 | # - If using GitHub Codespaces, use the "Ports" tab to open a browser window for this address. 273 | # - Default credentials for Grafana are username: admin and password: admin. 274 | # - Optionally, open the Spark WebUI at http://localhost:4040 to monitor the Spark job. 275 | 276 | # Wait a few minutes for metrics to populate the dashboard. 277 | # Note: This dashboard is more effective when Spark runs on cluster resources 278 | # rather than in the local mode demonstrated here. For more details, refer to the next paragraph. 279 | ``` 280 | 281 | 282 | ### Running TPCDS on a Spark cluster 283 | - Example of running TPCDS on a YARN Spark cluster, monitored with the Spark dashboard: 284 | ``` 285 | TPCDS_PYSPARK=`which tpcds_pyspark_run.py` 286 | 287 | spark-submit --master yarn --conf spark.log.level=error --conf spark.executor.cores=8 --conf spark.executor.memory=64g \ 288 | --conf spark.driver.memory=16g --conf spark.driver.extraClassPath=tpcds_pyspark/spark-measure_2.12-0.25.jar \ 289 | --conf spark.dynamicAllocation.enabled=false --conf spark.executor.instances=32 --conf spark.sql.shuffle.partitions=512 \ 290 | $TPCDS_PYSPARK -d hdfs:///tpcds_10000_parquet_1.13.1 291 | ``` 292 | 293 | - Example of running TPCDS on a Kubernetes cluster with S3 storage, monitored with the extended dashboard using Spark plugins: 294 | ``` 295 | TPCDS_PYSPARK=`which tpcds_pyspark_run.py` 296 | 297 | spark-submit --master k8s://https://xxx.xxx.xxx.xxx:6443 --conf spark.kubernetes.container.image=/spark:v3.5.1 --conf spark.kubernetes.namespace=xxx \ 298 | --conf spark.eventLog.enabled=false --conf spark.task.maxDirectResultSize=2000000000 --conf spark.shuffle.service.enabled=false --conf spark.executor.cores=8 --conf spark.executor.memory=32g --conf spark.driver.memory=4g \ 299 | --packages org.apache.hadoop:hadoop-aws:3.3.4,ch.cern.sparkmeasure:spark-measure_2.12:0.25,ch.cern.sparkmeasure:spark-plugins_2.12:0.4 --conf spark.plugins=ch.cern.HDFSMetrics,ch.cern.CgroupMetrics,ch.cern.CloudFSMetrics \ 300 | --conf spark.cernSparkPlugin.cloudFsName=s3a \ 301 | --conf spark.dynamicAllocation.enabled=false --conf spark.executor.instances=4 \ 302 | --conf spark.hadoop.fs.s3a.secret.key=$SECRET_KEY \ 303 | --conf spark.hadoop.fs.s3a.access.key=$ACCESS_KEY \ 304 | --conf spark.hadoop.fs.s3a.endpoint="https://s3.cern.ch" \ 305 | --conf spark.hadoop.fs.s3a.impl="org.apache.hadoop.fs.s3a.S3AFileSystem" \ 306 | --conf
spark.executor.metrics.fileSystemSchemes="file,hdfs,s3a" \ 307 | --conf spark.hadoop.fs.s3a.fast.upload=true \ 308 | --conf spark.hadoop.fs.s3a.path.style.access=true \ 309 | --conf spark.hadoop.fs.s3a.list.version=1 \ 310 | $TPCDS_PYSPARK -d s3a://luca/tpcds_100 311 | ``` 312 | 313 | --- 314 | ## Legacy implementation (spark-dashboard v1) 315 | 316 | Note: spark-dashboard v1 (the original implementation) uses InfluxDB as the time-series database; see also the 317 | [spark-dashboard v1 architecture](https://raw.githubusercontent.com/LucaCanali/Miscellaneous/master/Spark_Dashboard/images/Spark_metrics_dashboard_arch.PNG) 318 | 319 | ### How to run the Spark dashboard V1 on a container 320 | This is the original implementation of the tool, using InfluxDB and Grafana. 321 | 322 | **1. Start the container** 323 | The provided container image has been built and configured to run InfluxDB and Grafana 324 | - `docker run -p 3000:3000 -p 2003:2003 -d lucacanali/spark-dashboard:v01` 325 | - Note: port 2003 is for Graphite ingestion, port 3000 is for Grafana 326 | - More options, including how to persist InfluxDB data across restarts, at: [Spark dashboard in a container](dockerfiles_v1) 327 | 328 | **2. Spark configuration** 329 | See above 330 | 331 | **3. Visualize the metrics using a Grafana dashboard** 332 | - Point your browser to `http://hostname:3000` (edit `hostname` as relevant) 333 | - See details above 334 | 335 | --- 336 | ### How to run the dashboard V1 on Kubernetes using Helm 337 | If you choose to run on Kubernetes, these are the steps: 338 | 339 | 1. The Helm chart takes care of configuring and running InfluxDB and Grafana: 340 | - Quickstart: `helm install spark-dashboard https://github.com/cerndb/spark-dashboard/raw/master/charts_v1/spark-dashboard-0.3.0.tgz` 341 | - Details: [charts_v1](charts_v1) 342 | 343 | 2. Spark configuration: 344 | - Configure `metrics.properties` as detailed above. 345 | - Use `INFLUXDB_ENDPOINT=spark-dashboard-influx.default.svc.cluster.local` as the InfluxDB endpoint in 346 | the Spark configuration. 347 | 348 | 3. Grafana's visualization with Helm: 349 | - The Grafana dashboard is reachable at port 3000 of the spark-dashboard-grafana service. 350 | - See service details: `kubectl get service spark-dashboard-grafana` 351 | - When using NodePort and an internal cluster IP address, this is how you can port forward to the service from 352 | the local machine: `kubectl port-forward service/spark-dashboard-grafana 3000:3000` 353 | 354 | More info at [Spark dashboard on Kubernetes](charts_v1/README.md) 355 | 356 | --- 357 | ## Advanced configurations and notes 358 | 359 | ### Graph annotations: display query/job/stage start and end times 360 | Optionally, you can add annotation instrumentation to the performance dashboard v1. 361 | Annotations provide additional info on start and end times for queries, jobs and stages.
362 | To activate annotations, add the following additional configuration to spark-submit/spark-shell/pyspark, 363 | needed for collecting and writing extra performance data: 364 | ``` 365 | INFLUXDB_HTTP_ENDPOINT="http://`hostname`:8086" 366 | 367 | 368 | --packages ch.cern.sparkmeasure:spark-measure_2.12:0.25 \ 369 | --conf spark.sparkmeasure.influxdbURL=$INFLUXDB_HTTP_ENDPOINT \ 370 | --conf spark.extraListeners=ch.cern.sparkmeasure.InfluxDBSink \ 371 | ``` 372 | 373 | ### Notes 374 | - More details on how this works and alternative configurations at [Spark Dashboard](https://github.com/LucaCanali/Miscellaneous/tree/master/Spark_Dashboard) 375 | - The dashboard can be used when running Spark on a cluster (Kubernetes, YARN, Standalone) or in local mode. 376 | - When using Spark in local mode, use Spark version 3.1 or higher; see [SPARK-31711](https://issues.apache.org/jira/browse/SPARK-31711) 377 | 378 | ### Docker / Podman 379 | - For dashboard v2: Telegraf will use port 2003 (graphite endpoint), and VictoriaMetrics port 8428 (http endpoint), of your machine/VM. 380 | - For dashboard v1: InfluxDB will use port 2003 (graphite endpoint), and port 8086 (http endpoint) of 381 | your machine/VM (when running using `--network=host`). 382 | - Note: the endpoints need to be available on the node where you started the container and 383 | reachable by Spark executors and driver (mind the firewall). 384 | 385 | ### Helm 386 | - Find the InfluxDB endpoint IP with `kubectl get service spark-dashboard-influx`. 387 | - Optionally, resolve the DNS name of that IP with `nslookup`. 388 | For example, the InfluxDB service host name of a test installation is: `spark-dashboard-influx.default.svc.cluster.local` 389 | 390 | ### Customizing and adding new dashboards 391 | 392 | - This implementation comes with some example dashboards. Note that only a subset of the 393 | metrics values logged into VictoriaMetrics are visualized in the provided dashboard. 394 | - For a full list of the available metrics see the [documentation of Spark metrics system](https://github.com/apache/spark/blob/master/docs/monitoring.md#metrics). 395 | - New dashboards can be added by putting them in the relevant `grafana_dashboards` folder and re-building the container image 396 | (or re-packaging the helm chart), as sketched in the example below. 397 | - On Helm: running `helm upgrade` is enough to upload the new dashboard as a ConfigMap and make it available to Grafana. 398 | - Automatically persisting manual edits is not supported at this time.
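As a concrete illustration of the rebuild workflow mentioned above (a sketch; the dashboard file name and the image tag are illustrative), a dashboard exported from Grafana as JSON can be bundled into the v2 container image as follows:

```
# Sketch: bundle a custom Grafana dashboard into the v2 container image
# my_dashboard.json and the "custom" image tag are illustrative names
cd dockerfiles_v2
cp /path/to/my_dashboard.json grafana_dashboards/
docker build -t spark-dashboard:custom .
docker run -p 3000:3000 -p 2003:2003 -d spark-dashboard:custom
```
--------------------------------------------------------------------------------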