├── 2020-12-30-16-37-39.png
├── README.md
├── Running Spark on Kubernetes.pdf
├── applications
├── argocd
│ ├── .gitignore
│ ├── Chart.lock
│ ├── Chart.yaml
│ ├── argocd
│ │ └── argocd.yaml
│ ├── charts
│ │ └── argo-cd-3.0.0.tgz
│ ├── readme.md
│ └── values.yaml
├── platform
│ ├── argo-workflows
│ │ ├── Chart.lock
│ │ ├── Chart.yaml
│ │ ├── charts
│ │ │ └── argo-0.16.8.tgz
│ │ ├── dashboards
│ │ │ └── argo-workflows.json
│ │ ├── templates
│ │ │ ├── _helpers.tpl
│ │ │ └── configmap-dashboard.yaml
│ │ └── values.yaml
│ ├── monitoring
│ │ └── prometheus
│ │ │ ├── Chart.lock
│ │ │ ├── Chart.yaml
│ │ │ ├── README.md
│ │ │ ├── charts
│ │ │ └── kube-prometheus-stack-15.1.1.tgz
│ │ │ └── values.yaml
│ ├── rbac
│ │ ├── argo-workflow-sa.yaml
│ │ └── test-sa.yaml
│ └── spark-operator
│ │ ├── Chart.lock
│ │ ├── Chart.yaml
│ │ ├── charts
│ │ └── spark-operator-1.1.0.tgz
│ │ ├── dashboards
│ │ └── spark-operator.json
│ │ ├── templates
│ │ ├── _helpers.tpl
│ │ └── configmap-dashboard.yaml
│ │ └── values.yaml
└── spark-apps
│ ├── hello-argo-workflow-template
│ └── hello-argo-workflow-template.yaml
│ ├── hello-argo-workflows
│ └── hello-world-dag.yaml
│ ├── hello-spark-operator-argo-workflows
│ └── spark-operator-kubernetes-dag.yaml
│ ├── hello-spark-operator-history
│ └── spark-application.yaml
│ ├── hello-spark-operator-prometheus
│ ├── hello-spark-operator-prometheus.yaml
│ └── podMonitor.yaml
│ ├── hello-spark-operator
│ ├── ingress.yaml
│ └── spark-application.yaml
│ ├── hello-spark-submit
│ └── test-job-example.yaml
│ └── spark-history-server
│ ├── Chart.yaml
│ ├── templates
│ ├── _helpers.tpl
│ ├── configmap.yaml
│ ├── deployment.yaml
│ ├── ingress.yaml
│ ├── service.yaml
│ └── serviceaccount.yaml
│ └── values.yaml
├── deployments
└── applications
│ ├── platform
│ ├── Chart.yaml
│ ├── argo-apps.yaml
│ ├── templates
│ │ ├── argo-workflows.yaml
│ │ ├── prometheus.yaml
│ │ ├── rbac.yaml
│ │ └── spark-operator.yaml
│ └── values.yaml
│ └── spark-apps
│ ├── Chart.yaml
│ ├── argo-apps.yaml
│ ├── templates
│ ├── hello-argo-workflow-template.yaml
│ ├── hello-argo-workflows.yaml
│ ├── hello-spark-operator-argo-workflows.yaml
│ ├── hello-spark-operator-history.yaml
│ ├── hello-spark-operator.yaml
│ ├── hello-spark-submit.yaml
│ └── spark-history-server.yaml
│ └── values.yaml
├── kind
├── cluster-conf.yaml
├── example.yaml
└── ingress-nginx.yaml
├── spark-docker
├── Dockerfile
└── conf
│ ├── metrics.properties
│ └── prometheus.yaml
└── vanilla-k8s
├── argo-workflows
├── argo-cm.yaml
├── argo-workflow-sa.yaml
├── hello-world-dag.yaml
├── spark-kubernetes-dag.yaml
└── spark-operator-kubernetes-dag.yaml
├── spark-operator
└── spark-application.yaml
└── spark-submit
├── test-job-eks.yaml
├── test-job-example.yaml
└── test-sa.yaml
/2020-12-30-16-37-39.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/empathyco/platform-spark-kubernetes-samples/2370113d3c76af7725b9476cf1d6f0852925374e/2020-12-30-16-37-39.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spark on Kubernetes
2 | The aim of this repo is to show some samples of how to launch and orchestrate Spark jobs on Kubernetes
3 |
4 | The main points of this repo are:
5 | * Localhost deployment: Kind
6 | * Build Spark job as Docker image
7 | * Platform Applications Deployments
8 | * Spark Applications Deployments
9 | * Vanilla k8s
10 | * How to orchestrate the Spark jobs on Kubernetes: Argo Workflows
11 |
12 | Besides, you can follow the [slides](Running%20Spark%20on%20Kubernetes.pdf) for the [K8s Days Spain 2021](https://kcdspain.com/)
13 |
14 |
15 | ## Deploy cluster localhost
16 | * Deploy Kind cluster
17 | * Deploy Nginx Ingress
18 |
19 | ```sh
20 | kind create cluster --config=kind/cluster-conf.yaml
21 |
22 | kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/master/deploy/static/provider/kind/deploy.yaml
23 |
24 | kubectl wait --namespace ingress-nginx \
25 | --for=condition=ready pod \
26 | --selector=app.kubernetes.io/component=controller \
27 | --timeout=90s
28 |
29 | ```
30 |
31 | ## Build image
32 | A build sample can be found in the next repo:
33 | https://github.com/aws-samples/amazon-eks-apache-spark-etl-sample
34 |
35 | ```sh
36 | git clone git@github.com:empathyco/amazon-eks-apache-spark-etl-sample.git
37 |
38 | cd amazon-eks-apache-spark-etl-sample
39 |
40 | docker build --target=spark -t k8s-test/spark:v2.4.4 .
41 |
42 | docker build -t k8s-test/spark-on-localhost:v1.0 .
43 |
44 | ```
45 |
46 | Now the image should be loaded into kind as follows:
47 |
48 | ```sh
49 | kind load docker-image k8s-test/spark:v2.4.4
50 |
51 | kind load docker-image k8s-test/spark-on-localhost:v1.0
52 | ```
53 |
54 | A Docker image with Prometheus and aws-hadoop
55 |
56 | ```
57 | cd ./spark-docker
58 | docker build -t k8s-test/spark-prometheus:v1.6 .
59 | kind load docker-image k8s-test/spark-prometheus:v1.6
60 | ```
61 |
62 | ## Platform Applications Deployments
63 |
64 | * [ArgoCD](applications/argocd/values.yaml)
65 | * [platform-apps](deployments/applications/platform/argo-apps.yaml)
66 | * [Argo-Workflows](applications/platform/argo-workflows/values.yaml)
67 | * [Spark Operator](applications/platform/spark-operator/values.yaml)
68 | * [kube-prometheus-stack](applications/platform/monitoring/prometheus/values.yaml)
69 |
70 | ArgoCD
71 |
72 | ```sh
73 | cd ./applications/argocd
74 | helm install argocd . -f values.yaml
75 | ```
76 | ArgoCD will be deployed
77 |
78 | ```sh
79 | kubectl apply -f ./deployments/applications/platform/argo-apps.yaml
80 | ```
81 |
82 | All the platform applications will be created on ArgoCD
83 |
84 |
85 | ## Spark Applications Deployments
86 | * [Spark-apps](deployments/applications/spark-apps/argo-apps.yaml)
87 | * [hello-spark-submit](applications/spark-apps/hello-spark-submit/test-job-example.yaml)
88 | * [hello-spark-operator](applications/spark-apps/hello-spark-operator/spark-application.yaml)
89 | * [hello-argo-workflows](applications/spark-apps/hello-argo-workflows/hello-world-dag.yaml)
90 | * [hello-spark-operator-argo-workflows](applications/spark-apps/hello-spark-operator-argo-workflows/spark-operator-kubernetes-dag.yaml)
91 | * [hello-argo-workflow-template](applications/spark-apps/hello-argo-workflow-template/hello-argo-workflow-template.yaml)
92 | * [spark-history-server](applications/spark-apps/spark-history-server/values.yaml)
93 | * [hello-spark-operator-history](applications/spark-apps/hello-spark-operator-history/spark-application.yaml)
94 |
95 | ```sh
96 | kubectl apply -f ./deployments/applications/spark-apps/argo-apps.yaml
97 | ```
98 |
99 | ## Vanilla K8s
100 |
101 | ### Spark on Kubernetes
102 |
103 | #### Spark submit
104 |
105 | It's the easiest way to run Spark on Kubernetes
106 |
107 | ```sh
108 | # Check all possible clusters, as your .KUBECONFIG may have multiple contexts:
109 | kubectl config view -o jsonpath='{"Cluster name\tServer\n"}{range .clusters[*]}{.name}{"\t"}{.cluster.server}{"\n"}{end}'
110 |
111 | # Select name of cluster you want to interact with from above output:
112 | export CLUSTER_NAME="some_server_name"
113 |
114 | # Point to the API server referring the cluster name
115 | APISERVER=$(kubectl config view -o jsonpath="{.clusters[?(@.name==\"$CLUSTER_NAME\")].cluster.server}")
116 |
117 | # Gets the token value
118 | TOKEN=$(kubectl get secrets -o jsonpath="{.items[?(@.metadata.annotations['kubernetes\.io/service-account\.name']=='default')].data.token}"|base64 --decode)
119 |
120 | # Explore the API with TOKEN
121 | curl -X GET $APISERVER/api --header "Authorization: Bearer $TOKEN" --insecure
122 | ```
123 |
124 | The output is similar to this:
125 |
126 | ```json
127 | {
128 | "kind": "APIVersions",
129 | "versions": [
130 | "v1"
131 | ],
132 | "serverAddressByClientCIDRs": [
133 | {
134 | "clientCIDR": "0.0.0.0/0",
135 | "serverAddress": "172.18.0.2:6443"
136 | }
137 | ]
138 | }
139 | ```
140 |
141 | Here, `serverAddress` is the value to use as APISERVER_IP for the Kubernetes manifests in the example folder
142 |
143 | ```sh
144 | kubectl apply -f spark-submit/test-sa.yaml
145 |
146 | kubectl apply -f spark-submit/test-job-example.yaml
147 |
148 | kubectl apply -f spark-submit/test-job-eks.yaml (From localhost it fails because IAM roles permissions)
149 | ```
150 |
151 | #### Spark Operator
152 |
153 | Project status in beta, more info can be found in:
154 | [https://github.com/GoogleCloudPlatform/spark-on-k8s-operator](https://github.com/GoogleCloudPlatform/spark-on-k8s-operator)
155 |
156 | ```sh
157 | helm repo add spark-operator https://googlecloudplatform.github.io/spark-on-k8s-operator
158 |
159 | helm install my-release spark-operator/spark-operator --namespace spark-operator --create-namespace
160 |
161 | kubectl apply -f spark-operator/spark-application.yaml
162 | ```
163 |
164 | #### Spark UI
165 | ```sh
166 | kubectl port-forward spark-driver-po 4040:4040
167 | ```
168 |
169 | #### Argo Workflows
170 |
171 | Some Highlights:
172 | * API Rest to submit, get and delete workflows
173 | * Suspend execution, either for a duration or until it is resumed manually.
174 | * Directed Acyclic Graph (DAG): A set of steps and the dependencies between them
175 | * Workflow Template: a Kubernetes resource defining a reusable workflow for a namespace
176 | * Some more important features: https://github.com/argoproj/argo#features
177 |
178 |
179 | Install Argo Workflows and orchestrate two sequential Spark jobs on localhost.
180 |
181 | To be able to run Argo Workflows, the containerRuntimeExecutor needs to be changed to k8sapi [GitHub Issue](https://github.com/argoproj/argo/issues/2557#issuecomment-607239438)
182 |
183 | More info in:
184 | * https://github.com/argoproj/argo/blob/master/docs/workflow-executors.md
185 |
186 |
187 |
188 | ```sh
189 | kubectl create namespace argo
190 |
191 | kubectl apply -n argo -f https://raw.githubusercontent.com/argoproj/argo/stable/manifests/install.yaml
192 |
193 | kubectl -n argo apply -f argo-workflows/argo-cm.yaml
194 |
195 | kubectl -n default apply -f argo-workflows/argo-workflow-sa.yaml
196 |
197 | kubectl -n argo port-forward deploy/argo-server 2746:2746
198 |
199 | ```
200 |
201 | Some examples:
202 | ```sh
203 | kubectl create -f argo-workflows/hello-world-dag.yaml
204 |
205 | kubectl create -f argo-workflows/spark-kubernetes-dag.yaml
206 |
207 | ```
208 | The workflows can be viewed at http://localhost:2746
209 |
210 | 
211 |
212 | # Slides
213 |
214 | [Here](Running%20Spark%20on%20Kubernetes.pdf)
215 |
216 | # References
217 | * https://github.com/GoogleCloudPlatform/spark-on-k8s-operator
218 | * https://spark.apache.org/docs/latest/running-on-kubernetes.html
219 | * https://medium.com/@surajrajanathrapully/implementing-and-integrating-argo-workflow-and-spark-on-kubernetes-aaada016c803
220 | * https://argoproj.github.io/argo/
221 | * http://www.diva-portal.org/smash/get/diva2:1259247/FULLTEXT01.pdf
222 | * https://blog.lunatech.com/2019/01/running-the-spark-notebook-on-a-kubernetes-google-cloud-cluster/
223 | * https://github.com/apache/spark/tree/master/resource-managers/kubernetes/docker/src/main/dockerfiles/spark
224 | * https://aws.amazon.com/blogs/containers/optimizing-spark-performance-on-kubernetes/
225 | * https://godatadriven.com/blog/spark-kubernetes-argo-helm/
226 | * https://github.com/aws-samples/amazon-eks-apache-spark-etl-sample
227 | * https://itnext.io/migrating-apache-spark-workloads-from-aws-emr-to-kubernetes-463742b49fda
228 | * https://youtu.be/rpN9IsihEKI
--------------------------------------------------------------------------------
/Running Spark on Kubernetes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/empathyco/platform-spark-kubernetes-samples/2370113d3c76af7725b9476cf1d6f0852925374e/Running Spark on Kubernetes.pdf
--------------------------------------------------------------------------------
/applications/argocd/.gitignore:
--------------------------------------------------------------------------------
1 | secrets.yaml
2 |
--------------------------------------------------------------------------------
/applications/argocd/Chart.lock:
--------------------------------------------------------------------------------
1 | dependencies:
2 | - name: argo-cd
3 | repository: https://argoproj.github.io/argo-helm
4 | version: 3.0.0
5 | digest: sha256:efb28e552f304f5f52cb5ea36207d1bc2f2375ce990550a123184277356f6935
6 | generated: "2021-05-12T09:46:08.059224+02:00"
7 |
--------------------------------------------------------------------------------
/applications/argocd/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: argo-cd
3 | version: 3.0.0 # Keeps parity with the official argo-cd chart https://hub.helm.sh/charts/argo/argo-cd
4 | description: Helm chart for custom ArgoCD deployment
5 | dependencies:
6 | - name: argo-cd
7 | version: 3.0.0
8 | repository: https://argoproj.github.io/argo-helm
9 |
--------------------------------------------------------------------------------
/applications/argocd/argocd/argocd.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: argoproj.io/v1alpha1
2 | kind: Application
3 | metadata:
4 | name: argocd
5 | namespace: default
6 | spec:
7 | project: default
8 | source:
9 | repoURL: 'https://github.com/empathyco/platform-spark-kubernetes-samples.git'
10 | path: applications/argocd
11 | targetRevision: master
12 | destination:
13 | server: 'https://kubernetes.default.svc'
14 | namespace: default
15 | syncPolicy:
16 | automated:
17 | prune: true
18 | selfHeal: true
19 |
--------------------------------------------------------------------------------
/applications/argocd/charts/argo-cd-3.0.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/empathyco/platform-spark-kubernetes-samples/2370113d3c76af7725b9476cf1d6f0852925374e/applications/argocd/charts/argo-cd-3.0.0.tgz
--------------------------------------------------------------------------------
/applications/argocd/readme.md:
--------------------------------------------------------------------------------
1 | ## Pre installation
2 | Create k8s secrets with GitHub and Google credentials
3 |
4 | ## Installation
5 |
6 | `helm dependency update` to download official argo chart into `charts/`
7 |
8 |
9 |
10 | `helm install argocd . -f values.yaml -f secrets.yaml`
11 |
12 |
13 | ## Disable Auth
14 | `kubectl patch deploy argocd-server -n argocd -p '[{"op": "add", "path": "/spec/template/spec/containers/0/command/-", "value": "--disable-auth"}]' --type json`
--------------------------------------------------------------------------------
/applications/argocd/values.yaml:
--------------------------------------------------------------------------------
1 | argo-cd:
2 | ## ArgoCD configuration
3 | ## Ref: https://github.com/argoproj/argo-cd
4 | ##
5 |
6 | # Optional CRD installation for those without Helm hooks
7 | installCRDs: true
8 | server:
9 | extraArgs:
10 | - --insecure
11 | ingress:
12 | enabled: true
13 | hosts:
14 | - argo.localhost
15 | paths:
16 | - /
17 | ## Use init containers to configure custom tooling
18 | ## https://argoproj.github.io/argo-cd/operator-manual/custom_tools/
19 |     ## When using the volumes & volumeMounts section below, please comment out those above.
20 |
--------------------------------------------------------------------------------
/applications/platform/argo-workflows/Chart.lock:
--------------------------------------------------------------------------------
1 | dependencies:
2 | - name: argo
3 | repository: https://argoproj.github.io/argo-helm
4 | version: 0.16.8
5 | digest: sha256:b3e49a9cba3739971b47a09b9df5976e23958f9013327f81ca40d6b9ca199ec4
6 | generated: "2021-05-18T12:44:16.687348+02:00"
7 |
--------------------------------------------------------------------------------
/applications/platform/argo-workflows/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: argo-workflows
3 | version: 0.16.8 # Keeps parity with the official argo-workflows chart https://artifacthub.io/packages/helm/argo/argo
4 | description: Helm chart for Argo Workflows deployment
5 | dependencies:
6 | - name: argo
7 | version: 0.16.8
8 | repository: https://argoproj.github.io/argo-helm
--------------------------------------------------------------------------------
/applications/platform/argo-workflows/charts/argo-0.16.8.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/empathyco/platform-spark-kubernetes-samples/2370113d3c76af7725b9476cf1d6f0852925374e/applications/platform/argo-workflows/charts/argo-0.16.8.tgz
--------------------------------------------------------------------------------
/applications/platform/argo-workflows/dashboards/argo-workflows.json:
--------------------------------------------------------------------------------
1 | {
2 | "annotations": {
3 | "list": [
4 | {
5 | "builtIn": 1,
6 | "datasource": "-- Grafana --",
7 | "enable": true,
8 | "hide": true,
9 | "iconColor": "rgba(0, 211, 255, 1)",
10 | "name": "Annotations & Alerts",
11 | "type": "dashboard"
12 | }
13 | ]
14 | },
15 | "editable": true,
16 | "gnetId": 13927,
17 | "graphTooltip": 0,
18 | "iteration": 1621255594521,
19 | "links": [],
20 | "panels": [
21 | {
22 | "collapsed": false,
23 | "datasource": null,
24 | "gridPos": {
25 | "h": 1,
26 | "w": 24,
27 | "x": 0,
28 | "y": 0
29 | },
30 | "id": 28,
31 | "panels": [],
32 | "title": "Currently",
33 | "type": "row"
34 | },
35 | {
36 | "datasource": null,
37 | "fieldConfig": {
38 | "defaults": {
39 | "custom": {}
40 | },
41 | "overrides": []
42 | },
43 | "gridPos": {
44 | "h": 5,
45 | "w": 2,
46 | "x": 0,
47 | "y": 1
48 | },
49 | "id": 5,
50 | "options": {
 51 |         "content": "",
52 | "mode": "html"
53 | },
54 | "pluginVersion": "7.1.0",
55 | "targets": [
56 | {
57 | "queryType": "randomWalk",
58 | "refId": "A"
59 | }
60 | ],
61 | "timeFrom": null,
62 | "timeShift": null,
63 | "title": "",
64 | "type": "text"
65 | },
66 | {
67 | "datasource": "Prometheus",
68 | "fieldConfig": {
69 | "defaults": {
70 | "color": {
71 | "mode": "thresholds"
72 | },
73 | "custom": {},
74 | "mappings": [],
75 | "thresholds": {
76 | "mode": "absolute",
77 | "steps": [
78 | {
79 | "color": "green",
80 | "value": null
81 | },
82 | {
83 | "color": "orange",
84 | "value": 1
85 | }
86 | ]
87 | }
88 | },
89 | "overrides": []
90 | },
91 | "gridPos": {
92 | "h": 6,
93 | "w": 3,
94 | "x": 2,
95 | "y": 1
96 | },
97 | "id": 10,
98 | "options": {
99 | "colorMode": "value",
100 | "graphMode": "area",
101 | "justifyMode": "auto",
102 | "orientation": "auto",
103 | "reduceOptions": {
104 | "calcs": [
105 | "lastNotNull"
106 | ],
107 | "fields": "",
108 | "values": false
109 | },
110 | "text": {},
111 | "textMode": "auto"
112 | },
113 | "pluginVersion": "7.1.5",
114 | "targets": [
115 | {
116 | "expr": "argo_workflows_count{namespace=~\"$ns\",status=\"Pending\"} ",
117 | "instant": true,
118 | "interval": "",
119 | "legendFormat": "",
120 | "queryType": "randomWalk",
121 | "refId": "A"
122 | }
123 | ],
124 | "title": "WF Pending",
125 | "transparent": true,
126 | "type": "stat"
127 | },
128 | {
129 | "datasource": "Prometheus",
130 | "fieldConfig": {
131 | "defaults": {
132 | "color": {
133 | "mode": "thresholds"
134 | },
135 | "custom": {},
136 | "mappings": [],
137 | "thresholds": {
138 | "mode": "absolute",
139 | "steps": [
140 | {
141 | "color": "green",
142 | "value": null
143 | },
144 | {
145 | "color": "#EAB839",
146 | "value": 1
147 | }
148 | ]
149 | }
150 | },
151 | "overrides": []
152 | },
153 | "gridPos": {
154 | "h": 6,
155 | "w": 3,
156 | "x": 5,
157 | "y": 1
158 | },
159 | "id": 11,
160 | "options": {
161 | "colorMode": "value",
162 | "graphMode": "area",
163 | "justifyMode": "auto",
164 | "orientation": "auto",
165 | "reduceOptions": {
166 | "calcs": [
167 | "lastNotNull"
168 | ],
169 | "fields": "",
170 | "values": false
171 | },
172 | "text": {},
173 | "textMode": "auto"
174 | },
175 | "pluginVersion": "7.1.5",
176 | "targets": [
177 | {
178 | "expr": "argo_workflows_count{namespace=~\"$ns\",status=\"Running\"}",
179 | "instant": true,
180 | "interval": "",
181 | "legendFormat": "",
182 | "queryType": "randomWalk",
183 | "refId": "A"
184 | }
185 | ],
186 | "title": "WF Running",
187 | "transparent": true,
188 | "type": "stat"
189 | },
190 | {
191 | "datasource": "Prometheus",
192 | "fieldConfig": {
193 | "defaults": {
194 | "color": {
195 | "mode": "thresholds"
196 | },
197 | "custom": {},
198 | "mappings": [],
199 | "thresholds": {
200 | "mode": "absolute",
201 | "steps": [
202 | {
203 | "color": "green",
204 | "value": null
205 | },
206 | {
207 | "color": "red",
208 | "value": 1
209 | }
210 | ]
211 | }
212 | },
213 | "overrides": []
214 | },
215 | "gridPos": {
216 | "h": 6,
217 | "w": 3,
218 | "x": 8,
219 | "y": 1
220 | },
221 | "id": 8,
222 | "options": {
223 | "colorMode": "value",
224 | "graphMode": "area",
225 | "justifyMode": "auto",
226 | "orientation": "auto",
227 | "reduceOptions": {
228 | "calcs": [
229 | "lastNotNull"
230 | ],
231 | "fields": "",
232 | "values": false
233 | },
234 | "text": {},
235 | "textMode": "auto"
236 | },
237 | "pluginVersion": "7.1.5",
238 | "targets": [
239 | {
240 | "expr": "argo_workflows_count{namespace=~\"$ns\",status=\"Error\"}",
241 | "instant": true,
242 | "interval": "",
243 | "legendFormat": "",
244 | "queryType": "randomWalk",
245 | "refId": "A"
246 | }
247 | ],
248 | "title": "WF Errors",
249 | "transparent": true,
250 | "type": "stat"
251 | },
252 | {
253 | "datasource": "Prometheus",
254 | "fieldConfig": {
255 | "defaults": {
256 | "color": {
257 | "mode": "thresholds"
258 | },
259 | "custom": {},
260 | "mappings": [],
261 | "thresholds": {
262 | "mode": "absolute",
263 | "steps": [
264 | {
265 | "color": "green",
266 | "value": null
267 | },
268 | {
269 | "color": "red",
270 | "value": 1
271 | }
272 | ]
273 | }
274 | },
275 | "overrides": []
276 | },
277 | "gridPos": {
278 | "h": 6,
279 | "w": 3,
280 | "x": 11,
281 | "y": 1
282 | },
283 | "id": 9,
284 | "options": {
285 | "colorMode": "value",
286 | "graphMode": "area",
287 | "justifyMode": "auto",
288 | "orientation": "auto",
289 | "reduceOptions": {
290 | "calcs": [
291 | "lastNotNull"
292 | ],
293 | "fields": "",
294 | "values": false
295 | },
296 | "text": {},
297 | "textMode": "auto"
298 | },
299 | "pluginVersion": "7.1.5",
300 | "targets": [
301 | {
302 | "expr": "argo_workflows_count{namespace=~\"$ns\",status=\"Failed\"}",
303 | "instant": true,
304 | "interval": "",
305 | "legendFormat": "",
306 | "queryType": "randomWalk",
307 | "refId": "A"
308 | }
309 | ],
310 | "title": "WF Failed",
311 | "transparent": true,
312 | "type": "stat"
313 | },
314 | {
315 | "datasource": "Prometheus",
316 | "fieldConfig": {
317 | "defaults": {
318 | "color": {
319 | "mode": "thresholds"
320 | },
321 | "custom": {},
322 | "mappings": [],
323 | "thresholds": {
324 | "mode": "absolute",
325 | "steps": [
326 | {
327 | "color": "green",
328 | "value": null
329 | },
330 | {
331 | "color": "red",
332 | "value": 80
333 | }
334 | ]
335 | }
336 | },
337 | "overrides": []
338 | },
339 | "gridPos": {
340 | "h": 6,
341 | "w": 3,
342 | "x": 14,
343 | "y": 1
344 | },
345 | "id": 12,
346 | "options": {
347 | "colorMode": "value",
348 | "graphMode": "area",
349 | "justifyMode": "auto",
350 | "orientation": "auto",
351 | "reduceOptions": {
352 | "calcs": [
353 | "lastNotNull"
354 | ],
355 | "fields": "",
356 | "values": false
357 | },
358 | "text": {},
359 | "textMode": "auto"
360 | },
361 | "pluginVersion": "7.1.5",
362 | "targets": [
363 | {
364 | "expr": "argo_workflows_count{namespace=~\"$ns\",status=\"Skipped\"}",
365 | "instant": true,
366 | "interval": "",
367 | "legendFormat": "",
368 | "queryType": "randomWalk",
369 | "refId": "A"
370 | }
371 | ],
372 | "title": "WF Skipped",
373 | "transparent": true,
374 | "type": "stat"
375 | },
376 | {
377 | "datasource": "Prometheus",
378 | "fieldConfig": {
379 | "defaults": {
380 | "color": {
381 | "mode": "thresholds"
382 | },
383 | "custom": {},
384 | "mappings": [],
385 | "thresholds": {
386 | "mode": "absolute",
387 | "steps": [
388 | {
389 | "color": "green",
390 | "value": null
391 | }
392 | ]
393 | }
394 | },
395 | "overrides": []
396 | },
397 | "gridPos": {
398 | "h": 6,
399 | "w": 3,
400 | "x": 17,
401 | "y": 1
402 | },
403 | "id": 13,
404 | "options": {
405 | "colorMode": "value",
406 | "graphMode": "area",
407 | "justifyMode": "auto",
408 | "orientation": "auto",
409 | "reduceOptions": {
410 | "calcs": [
411 | "lastNotNull"
412 | ],
413 | "fields": "",
414 | "values": false
415 | },
416 | "text": {},
417 | "textMode": "auto"
418 | },
419 | "pluginVersion": "7.1.5",
420 | "targets": [
421 | {
422 | "expr": "argo_workflows_count{namespace=~\"$ns\",status=\"Succeeded\"}",
423 | "instant": true,
424 | "interval": "",
425 | "legendFormat": "",
426 | "queryType": "randomWalk",
427 | "refId": "A"
428 | }
429 | ],
430 | "title": "WF Succeeded",
431 | "transparent": true,
432 | "type": "stat"
433 | },
434 | {
435 | "collapsed": false,
436 | "datasource": null,
437 | "gridPos": {
438 | "h": 1,
439 | "w": 24,
440 | "x": 0,
441 | "y": 7
442 | },
443 | "id": 18,
444 | "panels": [],
445 | "title": "Number of Workflows currently accessible by the controller by status",
446 | "type": "row"
447 | },
448 | {
449 | "aliasColors": {},
450 | "bars": false,
451 | "dashLength": 10,
452 | "dashes": false,
453 | "datasource": "Prometheus",
454 | "decimals": 0,
455 | "fieldConfig": {
456 | "defaults": {
457 | "custom": {}
458 | },
459 | "overrides": []
460 | },
461 | "fill": 0,
462 | "fillGradient": 1,
463 | "gridPos": {
464 | "h": 9,
465 | "w": 12,
466 | "x": 0,
467 | "y": 8
468 | },
469 | "hiddenSeries": false,
470 | "id": 2,
471 | "legend": {
472 | "alignAsTable": true,
473 | "avg": false,
474 | "current": true,
475 | "hideEmpty": true,
476 | "max": true,
477 | "min": false,
478 | "rightSide": false,
479 | "show": true,
480 | "sort": "current",
481 | "sortDesc": true,
482 | "total": false,
483 | "values": true
484 | },
485 | "lines": true,
486 | "linewidth": 2,
487 | "nullPointMode": "null",
488 | "options": {
489 | "alertThreshold": true
490 | },
491 | "percentage": false,
492 | "pluginVersion": "7.1.5",
493 | "pointradius": 2,
494 | "points": false,
495 | "renderer": "flot",
496 | "seriesOverrides": [],
497 | "spaceLength": 10,
498 | "stack": false,
499 | "steppedLine": false,
500 | "targets": [
501 | {
502 | "expr": "argo_workflows_count{status!~\"(Error|Failed)\",namespace=~\"$ns\"}",
503 | "interval": "",
504 | "legendFormat": "{{namespace}} : {{status}}",
505 | "queryType": "randomWalk",
506 | "refId": "A"
507 | }
508 | ],
509 | "thresholds": [],
510 | "timeFrom": null,
511 | "timeRegions": [],
512 | "timeShift": null,
513 | "title": "Workflow Status ",
514 | "tooltip": {
515 | "shared": false,
516 | "sort": 2,
517 | "value_type": "individual"
518 | },
519 | "transparent": true,
520 | "type": "graph",
521 | "xaxis": {
522 | "buckets": null,
523 | "mode": "time",
524 | "name": null,
525 | "show": true,
526 | "values": []
527 | },
528 | "yaxes": [
529 | {
530 | "decimals": 0,
531 | "format": "short",
532 | "label": null,
533 | "logBase": 1,
534 | "max": null,
535 | "min": null,
536 | "show": true
537 | },
538 | {
539 | "decimals": null,
540 | "format": "short",
541 | "label": null,
542 | "logBase": 1,
543 | "max": null,
544 | "min": null,
545 | "show": true
546 | }
547 | ],
548 | "yaxis": {
549 | "align": false,
550 | "alignLevel": null
551 | }
552 | },
553 | {
554 | "alert": {
555 | "alertRuleTags": {},
556 | "conditions": [
557 | {
558 | "evaluator": {
559 | "params": [
560 | 0
561 | ],
562 | "type": "gt"
563 | },
564 | "operator": {
565 | "type": "and"
566 | },
567 | "query": {
568 | "params": [
569 | "A",
570 | "5m",
571 | "now"
572 | ]
573 | },
574 | "reducer": {
575 | "params": [],
576 | "type": "last"
577 | },
578 | "type": "query"
579 | }
580 | ],
581 | "executionErrorState": "alerting",
582 | "for": "5m",
583 | "frequency": "1m",
584 | "handler": 1,
585 | "name": "argowf failures",
586 | "noDataState": "no_data",
587 | "notifications": []
588 | },
589 | "aliasColors": {},
590 | "bars": false,
591 | "dashLength": 10,
592 | "dashes": false,
593 | "datasource": "Prometheus",
594 | "decimals": 0,
595 | "fieldConfig": {
596 | "defaults": {
597 | "custom": {}
598 | },
599 | "overrides": []
600 | },
601 | "fill": 0,
602 | "fillGradient": 1,
603 | "gridPos": {
604 | "h": 9,
605 | "w": 12,
606 | "x": 12,
607 | "y": 8
608 | },
609 | "hiddenSeries": false,
610 | "id": 29,
611 | "legend": {
612 | "alignAsTable": true,
613 | "avg": false,
614 | "current": true,
615 | "hideEmpty": true,
616 | "max": true,
617 | "min": false,
618 | "rightSide": false,
619 | "show": true,
620 | "sort": "current",
621 | "sortDesc": true,
622 | "total": false,
623 | "values": true
624 | },
625 | "lines": true,
626 | "linewidth": 2,
627 | "nullPointMode": "null",
628 | "options": {
629 | "alertThreshold": true
630 | },
631 | "percentage": false,
632 | "pluginVersion": "7.1.5",
633 | "pointradius": 2,
634 | "points": false,
635 | "renderer": "flot",
636 | "seriesOverrides": [],
637 | "spaceLength": 10,
638 | "stack": false,
639 | "steppedLine": false,
640 | "targets": [
641 | {
642 | "expr": "argo_workflows_count{status=~\"(Error|Failed)\"}",
643 | "interval": "",
644 | "legendFormat": "{{namespace}} : {{status}}",
645 | "queryType": "randomWalk",
646 | "refId": "A"
647 | }
648 | ],
649 | "thresholds": [
650 | {
651 | "colorMode": "critical",
652 | "fill": true,
653 | "line": true,
654 | "op": "gt",
655 | "value": 0,
656 | "visible": true
657 | }
658 | ],
659 | "timeFrom": null,
660 | "timeRegions": [],
661 | "timeShift": null,
662 | "title": "Workflow Errors Alerting ",
663 | "tooltip": {
664 | "shared": false,
665 | "sort": 2,
666 | "value_type": "individual"
667 | },
668 | "transparent": true,
669 | "type": "graph",
670 | "xaxis": {
671 | "buckets": null,
672 | "mode": "time",
673 | "name": null,
674 | "show": true,
675 | "values": []
676 | },
677 | "yaxes": [
678 | {
679 | "decimals": 0,
680 | "format": "short",
681 | "label": null,
682 | "logBase": 1,
683 | "max": null,
684 | "min": null,
685 | "show": true
686 | },
687 | {
688 | "decimals": null,
689 | "format": "short",
690 | "label": null,
691 | "logBase": 1,
692 | "max": null,
693 | "min": null,
694 | "show": true
695 | }
696 | ],
697 | "yaxis": {
698 | "align": false,
699 | "alignLevel": null
700 | }
701 | },
702 | {
703 | "collapsed": false,
704 | "datasource": null,
705 | "gridPos": {
706 | "h": 1,
707 | "w": 24,
708 | "x": 0,
709 | "y": 17
710 | },
711 | "id": 20,
712 | "panels": [],
713 | "title": "Histogram of durations of operations",
714 | "type": "row"
715 | },
716 | {
717 | "aliasColors": {},
718 | "bars": false,
719 | "dashLength": 10,
720 | "dashes": false,
721 | "datasource": "Prometheus",
722 | "decimals": 2,
723 | "fieldConfig": {
724 | "defaults": {
725 | "custom": {}
726 | },
727 | "overrides": []
728 | },
729 | "fill": 1,
730 | "fillGradient": 1,
731 | "gridPos": {
732 | "h": 9,
733 | "w": 24,
734 | "x": 0,
735 | "y": 18
736 | },
737 | "hiddenSeries": false,
738 | "id": 14,
739 | "legend": {
740 | "alignAsTable": true,
741 | "avg": true,
742 | "current": true,
743 | "hideEmpty": true,
744 | "max": true,
745 | "min": true,
746 | "rightSide": false,
747 | "show": true,
748 | "sort": "current",
749 | "sortDesc": true,
750 | "total": false,
751 | "values": true
752 | },
753 | "lines": true,
754 | "linewidth": 1,
755 | "nullPointMode": "null",
756 | "options": {
757 | "alertThreshold": true
758 | },
759 | "percentage": false,
760 | "pluginVersion": "7.1.5",
761 | "pointradius": 2,
762 | "points": true,
763 | "renderer": "flot",
764 | "seriesOverrides": [],
765 | "spaceLength": 10,
766 | "stack": false,
767 | "steppedLine": false,
768 | "targets": [
769 | {
770 | "expr": "histogram_quantile(0.95, sum(rate(argo_workflows_operation_duration_seconds_bucket{namespace=~\"^$ns$\"}[5m])) by (le)) ",
771 | "interval": "",
772 | "legendFormat": "95th percentile",
773 | "refId": "B"
774 | }
775 | ],
776 | "thresholds": [],
777 | "timeFrom": null,
778 | "timeRegions": [],
779 | "timeShift": null,
780 | "title": "Workflow operation duration",
781 | "tooltip": {
782 | "shared": false,
783 | "sort": 2,
784 | "value_type": "individual"
785 | },
786 | "transparent": true,
787 | "type": "graph",
788 | "xaxis": {
789 | "buckets": null,
790 | "mode": "time",
791 | "name": null,
792 | "show": true,
793 | "values": []
794 | },
795 | "yaxes": [
796 | {
797 | "decimals": 2,
798 | "format": "s",
799 | "label": null,
800 | "logBase": 1,
801 | "max": null,
802 | "min": null,
803 | "show": true
804 | },
805 | {
806 | "decimals": null,
807 | "format": "short",
808 | "label": null,
809 | "logBase": 1,
810 | "max": null,
811 | "min": null,
812 | "show": true
813 | }
814 | ],
815 | "yaxis": {
816 | "align": false,
817 | "alignLevel": null
818 | }
819 | },
820 | {
821 | "collapsed": false,
822 | "datasource": null,
823 | "gridPos": {
824 | "h": 1,
825 | "w": 24,
826 | "x": 0,
827 | "y": 27
828 | },
829 | "id": 22,
830 | "panels": [],
831 | "title": "Adds to the queue",
832 | "type": "row"
833 | },
834 | {
835 | "aliasColors": {},
836 | "bars": false,
837 | "dashLength": 10,
838 | "dashes": false,
839 | "datasource": "Prometheus",
840 | "decimals": 2,
841 | "fieldConfig": {
842 | "defaults": {
843 | "custom": {}
844 | },
845 | "overrides": []
846 | },
847 | "fill": 1,
848 | "fillGradient": 1,
849 | "gridPos": {
850 | "h": 9,
851 | "w": 24,
852 | "x": 0,
853 | "y": 28
854 | },
855 | "hiddenSeries": false,
856 | "id": 15,
857 | "legend": {
858 | "alignAsTable": true,
859 | "avg": false,
860 | "current": true,
861 | "hideEmpty": true,
862 | "max": true,
863 | "min": false,
864 | "rightSide": false,
865 | "show": true,
866 | "sort": "current",
867 | "sortDesc": true,
868 | "total": false,
869 | "values": true
870 | },
871 | "lines": true,
872 | "linewidth": 2,
873 | "nullPointMode": "null",
874 | "options": {
875 | "alertThreshold": true
876 | },
877 | "percentage": false,
878 | "pluginVersion": "7.1.5",
879 | "pointradius": 2,
880 | "points": false,
881 | "renderer": "flot",
882 | "seriesOverrides": [],
883 | "spaceLength": 10,
884 | "stack": false,
885 | "steppedLine": false,
886 | "targets": [
887 | {
888 | "expr": "delta(argo_workflows_queue_adds_count{namespace=~\"$ns\"}[2m])",
889 | "interval": "",
890 | "legendFormat": "{{namespace}} : {{queue_name}}",
891 | "queryType": "randomWalk",
892 | "refId": "A"
893 | }
894 | ],
895 | "thresholds": [],
896 | "timeFrom": null,
897 | "timeRegions": [],
898 | "timeShift": null,
899 | "title": "Workflow queue adds",
900 | "tooltip": {
901 | "shared": false,
902 | "sort": 2,
903 | "value_type": "individual"
904 | },
905 | "transparent": true,
906 | "type": "graph",
907 | "xaxis": {
908 | "buckets": null,
909 | "mode": "time",
910 | "name": null,
911 | "show": true,
912 | "values": []
913 | },
914 | "yaxes": [
915 | {
916 | "decimals": 2,
917 | "format": "short",
918 | "label": null,
919 | "logBase": 1,
920 | "max": null,
921 | "min": null,
922 | "show": true
923 | },
924 | {
925 | "decimals": null,
926 | "format": "short",
927 | "label": null,
928 | "logBase": 1,
929 | "max": null,
930 | "min": null,
931 | "show": true
932 | }
933 | ],
934 | "yaxis": {
935 | "align": false,
936 | "alignLevel": null
937 | }
938 | },
939 | {
940 | "aliasColors": {},
941 | "bars": false,
942 | "dashLength": 10,
943 | "dashes": false,
944 | "datasource": "Prometheus",
945 | "decimals": 2,
946 | "fieldConfig": {
947 | "defaults": {
948 | "custom": {}
949 | },
950 | "overrides": []
951 | },
952 | "fill": 1,
953 | "fillGradient": 1,
954 | "gridPos": {
955 | "h": 9,
956 | "w": 12,
957 | "x": 0,
958 | "y": 37
959 | },
960 | "hiddenSeries": false,
961 | "id": 16,
962 | "legend": {
963 | "alignAsTable": true,
964 | "avg": false,
965 | "current": true,
966 | "hideEmpty": true,
967 | "max": true,
968 | "min": false,
969 | "rightSide": false,
970 | "show": true,
971 | "sort": "current",
972 | "sortDesc": true,
973 | "total": false,
974 | "values": true
975 | },
976 | "lines": true,
977 | "linewidth": 2,
978 | "nullPointMode": "null",
979 | "options": {
980 | "alertThreshold": true
981 | },
982 | "percentage": false,
983 | "pluginVersion": "7.1.5",
984 | "pointradius": 2,
985 | "points": false,
986 | "renderer": "flot",
987 | "seriesOverrides": [],
988 | "spaceLength": 10,
989 | "stack": false,
990 | "steppedLine": false,
991 | "targets": [
992 | {
993 | "expr": "argo_workflows_queue_depth_count{namespace=~\"$ns\"}",
994 | "interval": "",
995 | "legendFormat": "{{namespace}} : {{queue_name}}",
996 | "queryType": "randomWalk",
997 | "refId": "A"
998 | }
999 | ],
1000 | "thresholds": [],
1001 | "timeFrom": null,
1002 | "timeRegions": [],
1003 | "timeShift": null,
1004 | "title": "Depth of the queue",
1005 | "tooltip": {
1006 | "shared": false,
1007 | "sort": 2,
1008 | "value_type": "individual"
1009 | },
1010 | "transparent": true,
1011 | "type": "graph",
1012 | "xaxis": {
1013 | "buckets": null,
1014 | "mode": "time",
1015 | "name": null,
1016 | "show": true,
1017 | "values": []
1018 | },
1019 | "yaxes": [
1020 | {
1021 | "decimals": 2,
1022 | "format": "short",
1023 | "label": null,
1024 | "logBase": 1,
1025 | "max": null,
1026 | "min": "0",
1027 | "show": true
1028 | },
1029 | {
1030 | "decimals": null,
1031 | "format": "short",
1032 | "label": null,
1033 | "logBase": 1,
1034 | "max": null,
1035 | "min": null,
1036 | "show": true
1037 | }
1038 | ],
1039 | "yaxis": {
1040 | "align": false,
1041 | "alignLevel": null
1042 | }
1043 | },
1044 | {
1045 | "aliasColors": {},
1046 | "bars": false,
1047 | "dashLength": 10,
1048 | "dashes": false,
1049 | "datasource": "Prometheus",
1050 | "decimals": 2,
1051 | "fieldConfig": {
1052 | "defaults": {
1053 | "custom": {}
1054 | },
1055 | "overrides": []
1056 | },
1057 | "fill": 0,
1058 | "fillGradient": 1,
1059 | "gridPos": {
1060 | "h": 9,
1061 | "w": 12,
1062 | "x": 12,
1063 | "y": 37
1064 | },
1065 | "hiddenSeries": false,
1066 | "id": 23,
1067 | "legend": {
1068 | "alignAsTable": true,
1069 | "avg": false,
1070 | "current": true,
1071 | "hideEmpty": true,
1072 | "max": true,
1073 | "min": false,
1074 | "rightSide": false,
1075 | "show": true,
1076 | "sort": "current",
1077 | "sortDesc": true,
1078 | "total": false,
1079 | "values": true
1080 | },
1081 | "lines": true,
1082 | "linewidth": 1,
1083 | "nullPointMode": "null",
1084 | "options": {
1085 | "alertThreshold": true
1086 | },
1087 | "percentage": false,
1088 | "pluginVersion": "7.1.5",
1089 | "pointradius": 2,
1090 | "points": false,
1091 | "renderer": "flot",
1092 | "seriesOverrides": [],
1093 | "spaceLength": 10,
1094 | "stack": false,
1095 | "steppedLine": false,
1096 | "targets": [
1097 | {
1098 | "expr": " rate(argo_workflows_queue_latency_sum{queue_name=\"cron_wf_queue\",namespace=~\"^$ns$\"}[5m])\n/\n rate(argo_workflows_queue_latency_count{queue_name=\"cron_wf_queue\",namespace=~\"^$ns$\"}[5m])",
1099 | "interval": "2m",
1100 | "legendFormat": "{{namespace}} : {{queue_name}}",
1101 | "queryType": "randomWalk",
1102 | "refId": "A"
1103 | },
1104 | {
1105 | "expr": " rate(argo_workflows_queue_latency_sum{queue_name=\"pod_queue\",namespace=~\"^$ns$\"}[5m])\n/\n rate(argo_workflows_queue_latency_count{queue_name=\"pod_queue\",namespace=~\"^$ns$\"}[5m])",
1106 | "interval": "",
1107 | "legendFormat": "{{namespace}} : {{queue_name}}",
1108 | "refId": "B"
1109 | },
1110 | {
1111 | "expr": " rate(argo_workflows_queue_latency_sum{queue_name=\"wf_cron_queue\",namespace=~\"^$ns$\"}[5m])\n/\n rate(argo_workflows_queue_latency_count{queue_name=\"wf_cron_queue\",namespace=~\"^$ns$\"}[5m])",
1112 | "hide": false,
1113 | "interval": "",
1114 | "legendFormat": "{{namespace}} : {{queue_name}}",
1115 | "refId": "C"
1116 | },
1117 | {
1118 | "expr": " rate(argo_workflows_queue_latency_sum{queue_name=\"workflow_queue\",namespace=~\"^$ns$\"}[5m])\n/\n rate(argo_workflows_queue_latency_count{queue_name=\"workflow_queue\",namespace=~\"^$ns$\"}[5m])",
1119 | "hide": false,
1120 | "interval": "",
1121 | "legendFormat": "{{namespace}} : {{queue_name}}",
1122 | "refId": "D"
1123 | }
1124 | ],
1125 | "thresholds": [],
1126 | "timeFrom": null,
1127 | "timeRegions": [],
1128 | "timeShift": null,
1129 | "title": "Time objects spend waiting in the queue",
1130 | "tooltip": {
1131 | "shared": false,
1132 | "sort": 2,
1133 | "value_type": "individual"
1134 | },
1135 | "transparent": true,
1136 | "type": "graph",
1137 | "xaxis": {
1138 | "buckets": null,
1139 | "mode": "time",
1140 | "name": null,
1141 | "show": true,
1142 | "values": []
1143 | },
1144 | "yaxes": [
1145 | {
1146 | "decimals": 2,
1147 | "format": "s",
1148 | "label": "avg",
1149 | "logBase": 1,
1150 | "max": null,
1151 | "min": null,
1152 | "show": true
1153 | },
1154 | {
1155 | "decimals": null,
1156 | "format": "short",
1157 | "label": null,
1158 | "logBase": 1,
1159 | "max": null,
1160 | "min": null,
1161 | "show": true
1162 | }
1163 | ],
1164 | "yaxis": {
1165 | "align": false,
1166 | "alignLevel": null
1167 | }
1168 | },
1169 | {
1170 | "collapsed": false,
1171 | "datasource": null,
1172 | "gridPos": {
1173 | "h": 1,
1174 | "w": 24,
1175 | "x": 0,
1176 | "y": 46
1177 | },
1178 | "id": 25,
1179 | "panels": [],
1180 | "title": "Total number of log messages",
1181 | "type": "row"
1182 | },
1183 | {
1184 | "aliasColors": {},
1185 | "bars": true,
1186 | "dashLength": 10,
1187 | "dashes": false,
1188 | "datasource": "Prometheus",
1189 | "decimals": 2,
1190 | "description": "",
1191 | "fieldConfig": {
1192 | "defaults": {
1193 | "custom": {}
1194 | },
1195 | "overrides": []
1196 | },
1197 | "fill": 1,
1198 | "fillGradient": 1,
1199 | "gridPos": {
1200 | "h": 9,
1201 | "w": 24,
1202 | "x": 0,
1203 | "y": 47
1204 | },
1205 | "hiddenSeries": false,
1206 | "id": 26,
1207 | "legend": {
1208 | "alignAsTable": true,
1209 | "avg": false,
1210 | "current": true,
1211 | "hideEmpty": true,
1212 | "max": true,
1213 | "min": false,
1214 | "rightSide": false,
1215 | "show": true,
1216 | "sort": "current",
1217 | "sortDesc": true,
1218 | "total": false,
1219 | "values": true
1220 | },
1221 | "lines": false,
1222 | "linewidth": 2,
1223 | "nullPointMode": "null",
1224 | "options": {
1225 | "alertThreshold": true
1226 | },
1227 | "percentage": false,
1228 | "pluginVersion": "7.1.5",
1229 | "pointradius": 2,
1230 | "points": false,
1231 | "renderer": "flot",
1232 | "seriesOverrides": [],
1233 | "spaceLength": 10,
1234 | "stack": false,
1235 | "steppedLine": false,
1236 | "targets": [
1237 | {
1238 | "expr": "rate(log_messages{namespace=~\"$ns\"}[2m])",
1239 | "interval": "",
1240 | "legendFormat": "{{namespace}} : {{level}}",
1241 | "queryType": "randomWalk",
1242 | "refId": "A"
1243 | }
1244 | ],
1245 | "thresholds": [],
1246 | "timeFrom": null,
1247 | "timeRegions": [],
1248 | "timeShift": null,
1249 | "title": "Log messages",
1250 | "tooltip": {
1251 | "shared": false,
1252 | "sort": 2,
1253 | "value_type": "individual"
1254 | },
1255 | "transparent": true,
1256 | "type": "graph",
1257 | "xaxis": {
1258 | "buckets": null,
1259 | "mode": "time",
1260 | "name": null,
1261 | "show": true,
1262 | "values": []
1263 | },
1264 | "yaxes": [
1265 | {
1266 | "decimals": 2,
1267 | "format": "short",
1268 | "label": "count/sec",
1269 | "logBase": 1,
1270 | "max": null,
1271 | "min": null,
1272 | "show": true
1273 | },
1274 | {
1275 | "decimals": null,
1276 | "format": "short",
1277 | "label": null,
1278 | "logBase": 1,
1279 | "max": null,
1280 | "min": null,
1281 | "show": true
1282 | }
1283 | ],
1284 | "yaxis": {
1285 | "align": false,
1286 | "alignLevel": null
1287 | }
1288 | }
1289 | ],
1290 | "refresh": "1m",
1291 | "schemaVersion": 26,
1292 | "style": "dark",
1293 | "tags": [],
1294 | "templating": {
1295 | "list": [
1296 | {
1297 | "allValue": null,
1298 | "current": {
1299 | "selected": false,
1300 | "text": "argo",
1301 | "value": "argo"
1302 | },
1303 | "datasource": "Prometheus",
1304 | "definition": "label_values(argo_workflows_count, namespace)",
1305 | "description": "Kubernetes namespace",
1306 | "error": null,
1307 | "hide": 0,
1308 | "includeAll": false,
1309 | "label": "k8s_ns",
1310 | "multi": false,
1311 | "name": "ns",
1312 | "options": [],
1313 | "query": "label_values(argo_workflows_count, namespace)",
1314 | "refresh": 2,
1315 | "regex": "",
1316 | "skipUrlSync": false,
1317 | "sort": 0,
1318 | "tagValuesQuery": "",
1319 | "tags": [],
1320 | "tagsQuery": "",
1321 | "type": "query",
1322 | "useTags": false
1323 | }
1324 | ]
1325 | },
1326 | "time": {
1327 | "from": "now-3h",
1328 | "to": "now"
1329 | },
1330 | "timepicker": {
1331 | "refresh_intervals": [
1332 | "5s",
1333 | "10s",
1334 | "30s",
1335 | "1m",
1336 | "5m",
1337 | "15m",
1338 | "30m",
1339 | "1h",
1340 | "2h",
1341 | "1d"
1342 | ]
1343 | },
1344 | "timezone": "",
1345 | "title": "ArgoWorkflow Metrics",
1346 | "uid": "FtBA16jMz",
1347 | "version": 2
1348 | }
--------------------------------------------------------------------------------
/applications/platform/argo-workflows/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 | {{/* vim: set filetype=mustache: */}}
2 | {{/*
3 | Chart name: .Values.nameOverride when set, otherwise .Chart.Name; truncated to 63 chars (DNS label limit) and stripped of a trailing "-".
4 | */}}
5 | {{- define "service.name" -}}
6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
7 | {{- end -}}
8 |
9 | {{/*
10 | Fully qualified app name, used to name Kubernetes resources. Resolution order:
11 | .Values.fullnameOverride if set; else the release name alone when it already
12 | contains the chart name; else "<release>-<chart>". Truncated to 63 chars (DNS).
13 | */}}
14 | {{- define "service.fullname" -}}
15 | {{- if .Values.fullnameOverride -}}
16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
17 | {{- else -}}
18 | {{- $name := default .Chart.Name .Values.nameOverride -}}
19 | {{- if contains $name .Release.Name -}}
20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}}
21 | {{- else -}}
22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
23 | {{- end -}}
24 | {{- end -}}
25 | {{- end -}}
26 |
27 | {{/*
28 | Value for the helm.sh/chart label: "<name>-<version>" with "+" replaced by "_" (label values may not contain "+"), truncated to 63 chars.
29 | */}}
30 | {{- define "service.chart" -}}
31 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
32 | {{- end -}}
33 |
34 | {{/*
35 | Common labels applied to every resource: chart label, the selector labels, app version (only when .Chart.AppVersion is set) and managed-by.
36 | */}}
37 | {{- define "service.labels" -}}
38 | helm.sh/chart: {{ include "service.chart" . }}
39 | {{ include "service.selectorLabels" . }}
40 | {{- if .Chart.AppVersion }}
41 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
42 | {{- end }}
43 | app.kubernetes.io/managed-by: {{ .Release.Service }}
44 | {{- end -}}
45 |
46 | {{/*
47 | Selector labels: the stable name/instance pair only, kept minimal so it is safe to use inside immutable label selectors.
48 | */}}
49 | {{- define "service.selectorLabels" -}}
50 | app.kubernetes.io/name: {{ include "service.name" . }}
51 | app.kubernetes.io/instance: {{ .Release.Name }}
52 | {{- end -}}
--------------------------------------------------------------------------------
/applications/platform/argo-workflows/templates/configmap-dashboard.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | data:
3 | {{ (.Files.Glob "dashboards/*.json").AsConfig | indent 2 }}
4 | kind: ConfigMap
5 | metadata:
6 | # Placed in the monitoring namespace so the Grafana dashboard sidecar
7 | # shipped with kube-prometheus-stack can discover it there.
8 | namespace: monitoring
9 | labels:
10 | # Label key the Grafana sidecar watches when collecting dashboard
11 | # ConfigMaps; the value itself is informational.
12 | grafana_dashboard: {{ include "service.fullname" . }}
13 | {{- include "service.labels" . | nindent 4 }}
14 | name: {{ include "service.fullname" . }}-dashboard
--------------------------------------------------------------------------------
/applications/platform/argo-workflows/values.yaml:
--------------------------------------------------------------------------------
1 | argo:
2 | images:
3 | tag: v3.0.1
4 | # Ship the Argo Workflows CRDs with the chart install.
5 | installCRD: true
6 |
7 | # https://argoproj.github.io/argo-workflows/workflow-controller-configmap/
8 | controller:
9 | parallelism: 10
10 | workflowDefaults:
11 | spec:
12 | ttlStrategy:
13 | secondsAfterCompletion: 259200
14 | workflowNamespaces:
15 | - default
16 | metricsConfig:
17 | enabled: true
18 | path: /metrics
19 | port: 9090
20 | serviceMonitor:
21 | enabled: true
22 | additionalLabels:
23 | release: prometheus
24 | containerRuntimeExecutor: k8sapi
25 | extraEnv:
26 | - name: LEADER_ELECTION_IDENTITY
27 | valueFrom:
28 | fieldRef:
29 | apiVersion: v1
30 | fieldPath: metadata.name
31 | server:
32 | secure: false
33 | extraArgs:
34 | - --secure=false
35 | ingress:
36 | enabled: true
37 | hosts:
38 | - argo-workflows.localhost
39 |
40 | # NOTE(review): presumably disables static artifact-repository credentials — confirm against the bundled argo chart's values.
41 | useStaticCredentials: false
42 |
43 |
44 |
--------------------------------------------------------------------------------
/applications/platform/monitoring/prometheus/Chart.lock:
--------------------------------------------------------------------------------
1 | dependencies:
2 | - name: kube-prometheus-stack
3 | repository: https://prometheus-community.github.io/helm-charts
4 | version: 15.1.1
5 | digest: sha256:161b57662e2f9a77d638f23ed5ea4a9cb2b7e11b13cc827b32411ee527440115
6 | generated: "2021-05-17T17:59:39.024703+02:00"
7 |
--------------------------------------------------------------------------------
/applications/platform/monitoring/prometheus/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | description: kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards,
3 | and Prometheus rules combined with documentation and scripts to provide easy to
4 | operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus
5 | Operator.
6 | icon: https://raw.githubusercontent.com/prometheus/prometheus.github.io/master/assets/prometheus_logo-cb55bb5c346.png
7 | type: application
8 | maintainers:
9 | - name: vsliouniaev
10 | - name: bismarck
11 | - email: gianrubio@gmail.com
12 | name: gianrubio
13 | - email: github.gkarthiks@gmail.com
14 | name: gkarthiks
15 | - email: scott@r6by.com
16 | name: scottrigby
17 | - email: miroslav.hadzhiev@gmail.com
18 | name: Xtigyro
19 | name: kube-prometheus-stack
20 | sources:
21 | - https://github.com/prometheus-community/helm-charts
22 | - https://github.com/prometheus-operator/kube-prometheus
23 | version: 15.1.1
24 | appVersion: 0.47.0
25 | kubeVersion: ">=1.16.0-0"
26 | home: https://github.com/prometheus-operator/kube-prometheus
27 | keywords:
28 | - operator
29 | - prometheus
30 | - kube-prometheus
31 | annotations:
32 | artifacthub.io/operator: "true"
33 | artifacthub.io/links: |
34 | - name: Chart Source
35 | url: https://github.com/prometheus-community/helm-charts
36 | - name: Upstream Project
37 | url: https://github.com/prometheus-operator/kube-prometheus
38 | dependencies:
39 | - name: kube-prometheus-stack
40 | version: 15.1.1
41 | repository: https://prometheus-community.github.io/helm-charts
42 |
--------------------------------------------------------------------------------
/applications/platform/monitoring/prometheus/README.md:
--------------------------------------------------------------------------------
1 | # kube-prometheus-stack
2 |
3 | Installs the [kube-prometheus stack](https://github.com/prometheus-operator/kube-prometheus), a collection of Kubernetes manifests, [Grafana](http://grafana.com/) dashboards, and [Prometheus rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with [Prometheus](https://prometheus.io/) using the [Prometheus Operator](https://github.com/prometheus-operator/prometheus-operator).
4 |
5 | See the [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) README for details about components, dashboards, and alerts.
6 |
7 | _Note: This chart was formerly named `prometheus-operator` chart, now renamed to more clearly reflect that it installs the `kube-prometheus` project stack, within which Prometheus Operator is only one component._
8 |
9 | ## Prerequisites
10 |
11 | - Kubernetes 1.16+
12 | - Helm 3+
13 |
14 | ## Get Repo Info
15 |
16 | ```console
17 | helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
18 | helm repo add stable https://charts.helm.sh/stable
19 | helm repo update
20 | ```
21 |
22 | _See [helm repo](https://helm.sh/docs/helm/helm_repo/) for command documentation._
23 |
24 | ## Install Chart
25 |
26 | ```console
27 | # Helm
28 | $ helm install [RELEASE_NAME] prometheus-community/kube-prometheus-stack
29 | ```
30 |
31 | _See [configuration](#configuration) below._
32 |
33 | _See [helm install](https://helm.sh/docs/helm/helm_install/) for command documentation._
34 |
35 | ## Dependencies
36 |
37 | By default this chart installs additional, dependent charts:
38 |
39 | - [stable/kube-state-metrics](https://github.com/helm/charts/tree/master/stable/kube-state-metrics)
40 | - [prometheus-community/prometheus-node-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter)
41 | - [grafana/grafana](https://github.com/grafana/helm-charts/tree/main/charts/grafana)
42 |
43 | To disable dependencies during installation, see [multiple releases](#multiple-releases) below.
44 |
45 | _See [helm dependency](https://helm.sh/docs/helm/helm_dependency/) for command documentation._
46 |
47 | ## Uninstall Chart
48 |
49 | ```console
50 | # Helm
51 | $ helm uninstall [RELEASE_NAME]
52 | ```
53 |
54 | This removes all the Kubernetes components associated with the chart and deletes the release.
55 |
56 | _See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall/) for command documentation._
57 |
58 | CRDs created by this chart are not removed by default and should be manually cleaned up:
59 |
60 | ```console
61 | kubectl delete crd alertmanagerconfigs.monitoring.coreos.com
62 | kubectl delete crd alertmanagers.monitoring.coreos.com
63 | kubectl delete crd podmonitors.monitoring.coreos.com
64 | kubectl delete crd probes.monitoring.coreos.com
65 | kubectl delete crd prometheuses.monitoring.coreos.com
66 | kubectl delete crd prometheusrules.monitoring.coreos.com
67 | kubectl delete crd servicemonitors.monitoring.coreos.com
68 | kubectl delete crd thanosrulers.monitoring.coreos.com
69 | ```
70 |
71 | ## Upgrading Chart
72 |
73 | ```console
74 | # Helm
75 | $ helm upgrade [RELEASE_NAME] prometheus-community/kube-prometheus-stack
76 | ```
77 |
78 | With Helm v3, CRDs created by this chart are not updated by default and should be manually updated.
79 | Consult also the [Helm Documentation on CRDs](https://helm.sh/docs/chart_best_practices/custom_resource_definitions).
80 |
81 | _See [helm upgrade](https://helm.sh/docs/helm/helm_upgrade/) for command documentation._
82 |
83 | ### Upgrading an existing Release to a new major version
84 |
85 | A major chart version change (like v1.2.3 -> v2.0.0) indicates that there is an incompatible breaking change needing manual actions.
86 |
87 | ### From 11.x to 12.x
88 |
89 | The chart was migrated to support only helm v3 and later.
90 |
91 | ### From 10.x to 11.x
92 |
93 | Version 11 upgrades prometheus-operator from 0.42.x to 0.43.x. Starting with 0.43.x an additional `AlertmanagerConfigs` CRD is introduced. Helm does not automatically upgrade or install new CRDs on a chart upgrade, so you have to install the CRD manually before updating:
94 |
95 | ```console
96 | kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/release-0.43/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
97 | ```
98 |
99 | Version 11 removes the deprecated tlsProxy via ghostunnel in favor of native TLS support the prometheus-operator gained with v0.39.0.
100 |
101 | ### From 9.x to 10.x
102 |
103 | Version 10 upgrades prometheus-operator from 0.38.x to 0.42.x. Starting with 0.40.x an additional `Probes` CRD is introduced. Helm does not automatically upgrade or install new CRDs on a chart upgrade, so you have to install the CRD manually before updating:
104 |
105 | ```console
106 | kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/release-0.42/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
107 | ```
108 |
109 | ### From 8.x to 9.x
110 |
111 | Version 9 of the helm chart removes the existing `additionalScrapeConfigsExternal` in favour of `additionalScrapeConfigsSecret`. This change lets users specify the secret name and secret key to use for the additional scrape configuration of prometheus. This is useful for users that have prometheus-operator as a subchart and also have a template that creates the additional scrape configuration.
112 |
113 | ### From 7.x to 8.x
114 |
115 | Due to new template functions being used in the rules in version 8.x.x of the chart, an upgrade to Prometheus Operator and Prometheus is necessary in order to support them. First, upgrade to the latest version of 7.x.x
116 |
117 | ```console
118 | helm upgrade [RELEASE_NAME] prometheus-community/kube-prometheus-stack --version 7.5.0
119 | ```
120 |
121 | Then upgrade to 8.x.x
122 |
123 | ```console
124 | helm upgrade [RELEASE_NAME] prometheus-community/kube-prometheus-stack --version [8.x.x]
125 | ```
126 |
127 | Minimal recommended Prometheus version for this chart release is `2.12.x`
128 |
129 | ### From 6.x to 7.x
130 |
131 | Due to a change in grafana subchart, version 7.x.x now requires Helm >= 2.12.0.
132 |
133 | ### From 5.x to 6.x
134 |
135 | Due to a change in deployment labels of kube-state-metrics, the upgrade requires `helm upgrade --force` in order to re-create the deployment. If this is not done an error will occur indicating that the deployment cannot be modified:
136 |
137 | ```console
138 | invalid: spec.selector: Invalid value: v1.LabelSelector{MatchLabels:map[string]string{"app.kubernetes.io/name":"kube-state-metrics"}, MatchExpressions:[]v1.LabelSelectorRequirement(nil)}: field is immutable
139 | ```
140 |
141 | If this error has already been encountered, use `helm history` to determine which release last worked, then `helm rollback` to that release, and finally `helm upgrade --force` to the new one.
142 |
143 | ## Configuration
144 |
145 | See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_helm/#customizing-the-chart-before-installing). To see all configurable options with detailed comments:
146 |
147 | ```console
148 | helm show values prometheus-community/kube-prometheus-stack
149 | ```
150 |
151 | You may also `helm show values` on this chart's [dependencies](#dependencies) for additional options.
152 |
153 | ### Multiple releases
154 |
155 | The same chart can be used to run multiple Prometheus instances in the same cluster if required. To achieve this, it is necessary to run only one instance of prometheus-operator and a pair of alertmanager pods for an HA configuration, while all other components need to be disabled. To disable a dependency during installation, set `kubeStateMetrics.enabled`, `nodeExporter.enabled` and `grafana.enabled` to `false`.
156 |
157 | ## Work-Arounds for Known Issues
158 |
159 | ### Running on private GKE clusters
160 |
161 | When Google configures the control plane for private clusters, it automatically configures VPC peering between your Kubernetes cluster’s network and a separate Google-managed project. To restrict what Google can access within your cluster, the configured firewall rules restrict access to your Kubernetes pods. This means that in order to use the webhook component with a GKE private cluster, you must add a firewall rule allowing the GKE control plane access to your webhook pod.
162 |
163 | You can read more information on how to add firewall rules for the GKE control plane nodes in the [GKE docs](https://cloud.google.com/kubernetes-engine/docs/how-to/private-clusters#add_firewall_rules)
164 |
165 | Alternatively, you can disable the hooks by setting `prometheusOperator.admissionWebhooks.enabled=false`.
166 |
167 | ## PrometheusRules Admission Webhooks
168 |
169 | With Prometheus Operator version 0.30+, the core Prometheus Operator pod exposes an endpoint that will integrate with the `validatingwebhookconfiguration` Kubernetes feature to prevent malformed rules from being added to the cluster.
170 |
171 | ### How the Chart Configures the Hooks
172 |
173 | A validating and mutating webhook configuration requires the endpoint to which the request is sent to use TLS. It is possible to set up custom certificates to do this, but in most cases, a self-signed certificate is enough. The setup of this component requires some more complex orchestration when using helm. The steps are created to be idempotent and to allow turning the feature on and off without running into helm quirks.
174 |
175 | 1. A pre-install hook provisions a certificate into the same namespace using a format compatible with provisioning using end-user certificates. If the certificate already exists, the hook exits.
176 | 2. The prometheus operator pod is configured to use a TLS proxy container, which will load that certificate.
177 | 3. Validating and Mutating webhook configurations are created in the cluster, with their failure mode set to Ignore. This allows rules to be created by the same chart at the same time, even though the webhook has not yet been fully set up - it does not have the correct CA field set.
178 | 4. A post-install hook reads the CA from the secret created by step 1 and patches the Validating and Mutating webhook configurations. This process will allow a custom CA provisioned by some other process to also be patched into the webhook configurations. The chosen failure policy is also patched into the webhook configurations
179 |
180 | ### Alternatives
181 |
182 | It should be possible to use [jetstack/cert-manager](https://github.com/jetstack/cert-manager) if a more complete solution is required, but it has not been tested.
183 |
184 | ### Limitations
185 |
186 | Because the operator can only run as a single pod, there is potential for this component failure to cause rule deployment failure. Because this risk is outweighed by the benefit of having validation, the feature is enabled by default.
187 |
188 | ## Developing Prometheus Rules and Grafana Dashboards
189 |
190 | This chart Grafana Dashboards and Prometheus Rules are just a copy from [prometheus-operator/prometheus-operator](https://github.com/prometheus-operator/prometheus-operator) and other sources, synced (with alterations) by scripts in [hack](hack) folder. In order to introduce any changes you need to first [add them to the original repo](https://github.com/prometheus-operator/kube-prometheus/blob/master/docs/developing-prometheus-rules-and-grafana-dashboards.md) and then sync there by scripts.
191 |
192 | ## Further Information
193 |
194 | For more in-depth documentation of configuration options meanings, please see
195 |
196 | - [Prometheus Operator](https://github.com/prometheus-operator/prometheus-operator)
197 | - [Prometheus](https://prometheus.io/docs/introduction/overview/)
198 | - [Grafana](https://github.com/grafana/helm-charts/tree/main/charts/grafana#grafana-helm-chart)
199 |
200 | ## prometheus.io/scrape
201 |
202 | The prometheus operator does not support annotation-based discovery of services, using the `PodMonitor` or `ServiceMonitor` CRD in its place as they provide far more configuration options.
203 | For information on how to use PodMonitors/ServiceMonitors, please see the documentation on the `prometheus-operator/prometheus-operator` documentation here:
204 |
205 | - [ServiceMonitors](https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/user-guides/getting-started.md#include-servicemonitors)
206 | - [PodMonitors](https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/user-guides/getting-started.md#include-podmonitors)
207 | - [Running Exporters](https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/user-guides/running-exporters.md)
208 |
209 | By default, Prometheus discovers PodMonitors and ServiceMonitors within its namespace, that are labeled with the same release tag as the prometheus-operator release.
210 | Sometimes, you may need to discover custom PodMonitors/ServiceMonitors, for example used to scrape data from third-party applications.
211 | An easy way of doing this, without compromising the default PodMonitors/ServiceMonitors discovery, is allowing Prometheus to discover all PodMonitors/ServiceMonitors within its namespace, without applying label filtering.
212 | To do so, you can set `prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues` and `prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues` to `false`.
213 |
214 | ## Migrating from stable/prometheus-operator chart
215 |
216 | ## Zero downtime
217 |
218 | Since `kube-prometheus-stack` is fully compatible with the `stable/prometheus-operator` chart, a migration without downtime can be achieved.
219 | However, the old name prefix needs to be kept. If you want the new name please follow the step by step guide below (with downtime).
220 |
221 | You can override the name to achieve this:
222 |
223 | ```console
224 | helm upgrade prometheus-operator prometheus-community/kube-prometheus-stack -n monitoring --reuse-values --set nameOverride=prometheus-operator
225 | ```
226 |
227 | **Note**: It is recommended to run this first with `--dry-run --debug`.
228 |
229 | ## Redeploy with new name (downtime)
230 |
231 | If the **prometheus-operator** values are compatible with the new **kube-prometheus-stack** chart, please follow the below steps for migration:
232 |
233 | > The guide presumes that the chart is deployed in the `monitoring` namespace and the deployments are running there. If deployed in another namespace, please replace `monitoring` with the deployed namespace.
234 |
235 | 1. Patch the PersistentVolume created/used by the prometheus-operator chart to the `Retain` claim policy:
236 |
237 | ```console
238 | kubectl patch pv/<pv-name> -p '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}'
239 | ```
240 |
241 | **Note:** To execute the above command, the user must have a cluster wide permission. Please refer [Kubernetes RBAC](https://kubernetes.io/docs/reference/access-authn-authz/rbac/)
242 |
243 | 2. Uninstall the **prometheus-operator** release, delete the existing PersistentVolumeClaim, and verify that the PV becomes Released.
244 |
245 | ```console
246 | helm uninstall prometheus-operator -n monitoring
247 | kubectl delete pvc/<pvc-name> -n monitoring
248 | ```
249 |
250 | Additionally, you have to manually remove the remaining `prometheus-operator-kubelet` service.
251 |
252 | ```console
253 | kubectl delete service/prometheus-operator-kubelet -n kube-system
254 | ```
255 |
256 | You can choose to remove all your existing CRDs (ServiceMonitors, Podmonitors, etc.) if you want to.
257 |
258 | 3. Remove current `spec.claimRef` values to change the PV's status from Released to Available.
259 |
260 | ```console
261 | kubectl patch pv/<pv-name> --type json -p='[{"op": "remove", "path": "/spec/claimRef"}]'
262 | ```
263 |
264 | **Note:** To execute the above command, the user must have a cluster wide permission. Please refer to [Kubernetes RBAC](https://kubernetes.io/docs/reference/access-authn-authz/rbac/)
265 |
266 | After these steps, proceed to a fresh **kube-prometheus-stack** installation and make sure the current release of **kube-prometheus-stack** matches the `volumeClaimTemplate` values in the `values.yaml`.
267 |
268 | The binding is done via matching a specific amount of storage requested and with certain access modes.
269 |
270 | For example, if you had storage specified as this with **prometheus-operator**:
271 |
272 | ```yaml
273 | volumeClaimTemplate:
274 | spec:
275 | storageClassName: gp2
276 | accessModes: ["ReadWriteOnce"]
277 | resources:
278 | requests:
279 | storage: 50Gi
280 | ```
281 |
282 | You have to specify matching `volumeClaimTemplate` with 50Gi storage and `ReadWriteOnce` access mode.
283 |
284 | Additionally, you should check the current AZ of your legacy installation's PV, and configure the fresh release to use the same AZ as the old one. If the pods are in a different AZ than the PV, the release will fail to bind the existing one, hence creating a new PV.
285 |
286 | This can be achieved either by specifying the labels through `values.yaml`, e.g. setting `prometheus.prometheusSpec.nodeSelector` to:
287 |
288 | ```yaml
289 | nodeSelector:
290 | failure-domain.beta.kubernetes.io/zone: east-west-1a
291 | ```
292 |
293 | or passing these values as `--set` overrides during installation.
294 |
295 | The new release should now re-attach your previously released PV with its content.
296 |
297 | ## Migrating from coreos/prometheus-operator chart
298 |
299 | The multiple charts have been combined into a single chart that installs prometheus operator, prometheus, alertmanager, grafana as well as the multitude of exporters necessary to monitor a cluster.
300 |
301 | There is no simple and direct migration path between the charts as the changes are extensive and intended to make the chart easier to support.
302 |
303 | The capabilities of the old chart are all available in the new chart, including the ability to run multiple prometheus instances on a single cluster - you will need to disable the parts of the chart you do not wish to deploy.
304 |
305 | You can check out the tickets for this change [here](https://github.com/prometheus-operator/prometheus-operator/issues/592) and [here](https://github.com/helm/charts/pull/6765).
306 |
307 | ### High-level overview of Changes
308 |
309 | #### Added dependencies
310 |
311 | The chart has added 3 [dependencies](#dependencies).
312 |
313 | - Node-Exporter, Kube-State-Metrics: These components are loaded as dependencies into the chart, and are relatively simple components
314 | - Grafana: The Grafana chart is more feature-rich than this chart - it contains a sidecar that is able to load data sources and dashboards from configmaps deployed into the same cluster. For more information check out the [documentation for the chart](https://github.com/helm/charts/tree/master/stable/grafana)
315 |
316 | #### Kubelet Service
317 |
318 | Because the kubelet service has a new name in the chart, make sure to clean up the old kubelet service in the `kube-system` namespace to prevent counting container metrics twice.
319 |
320 | #### Persistent Volumes
321 |
322 | If you would like to keep the data of the current persistent volumes, it should be possible to attach existing volumes to new PVCs and PVs that are created using the conventions in the new chart. For example, in order to use an existing Azure disk for a helm release called `prometheus-migration` the following resources can be created:
323 |
324 | ```yaml
325 | apiVersion: v1
326 | kind: PersistentVolume
327 | metadata:
328 | name: pvc-prometheus-migration-prometheus-0
329 | spec:
330 | accessModes:
331 | - ReadWriteOnce
332 | azureDisk:
333 | cachingMode: None
334 | diskName: pvc-prometheus-migration-prometheus-0
335 | diskURI: /subscriptions/f5125d82-2622-4c50-8d25-3f7ba3e9ac4b/resourceGroups/sample-migration-resource-group/providers/Microsoft.Compute/disks/pvc-prometheus-migration-prometheus-0
336 | fsType: ""
337 | kind: Managed
338 | readOnly: false
339 | capacity:
340 | storage: 1Gi
341 | persistentVolumeReclaimPolicy: Delete
342 | storageClassName: prometheus
343 | volumeMode: Filesystem
344 | ```
345 |
346 | ```yaml
347 | apiVersion: v1
348 | kind: PersistentVolumeClaim
349 | metadata:
350 | labels:
351 | app: prometheus
352 | prometheus: prometheus-migration-prometheus
353 | name: prometheus-prometheus-migration-prometheus-db-prometheus-prometheus-migration-prometheus-0
354 | namespace: monitoring
355 | spec:
356 | accessModes:
357 | - ReadWriteOnce
358 | resources:
359 | requests:
360 | storage: 1Gi
361 | storageClassName: prometheus
362 | volumeMode: Filesystem
363 | volumeName: pvc-prometheus-migration-prometheus-0
364 | ```
365 |
366 | The PVC will take ownership of the PV and when you create a release using a persistent volume claim template it will use the existing PVCs as they match the naming convention used by the chart. For other cloud providers similar approaches can be used.
367 |
368 | #### KubeProxy
369 |
370 | The metrics bind address of kube-proxy defaults to `127.0.0.1:10249`, which Prometheus instances **cannot** access. To collect these metrics, expose them by changing the `metricsBindAddress` field value to `0.0.0.0:10249`.
371 |
372 | Depending on the cluster, the relevant part of `config.conf` will be in the ConfigMap `kube-system/kube-proxy` or `kube-system/kube-proxy-config`. For example:
373 |
374 | ```console
375 | kubectl -n kube-system edit cm kube-proxy
376 | ```
377 |
378 | ```yaml
379 | apiVersion: v1
380 | data:
381 | config.conf: |-
382 | apiVersion: kubeproxy.config.k8s.io/v1alpha1
383 | kind: KubeProxyConfiguration
384 | # ...
385 | # metricsBindAddress: 127.0.0.1:10249
386 | metricsBindAddress: 0.0.0.0:10249
387 | # ...
388 | kubeconfig.conf: |-
389 | # ...
390 | kind: ConfigMap
391 | metadata:
392 | labels:
393 | app: kube-proxy
394 | name: kube-proxy
395 | namespace: kube-system
396 | ```
397 |
--------------------------------------------------------------------------------
/applications/platform/monitoring/prometheus/charts/kube-prometheus-stack-15.1.1.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/empathyco/platform-spark-kubernetes-samples/2370113d3c76af7725b9476cf1d6f0852925374e/applications/platform/monitoring/prometheus/charts/kube-prometheus-stack-15.1.1.tgz
--------------------------------------------------------------------------------
/applications/platform/monitoring/prometheus/values.yaml:
--------------------------------------------------------------------------------
1 | kube-prometheus-stack:
2 | # Remove some rules we cannot scrape
3 | defaultRules:
4 | rules:
5 | etcd: false
6 | kubeScheduler: false
7 |
8 | alertmanager:
9 | ingress:
10 | enabled: true
11 | hosts:
12 | - alertmanager.localhost
13 | paths:
14 | - /
15 |
16 | grafana:
17 | enabled: true
18 | ingress:
19 | enabled: true
20 | hosts:
21 | - grafana.localhost
22 | paths:
23 | - /
24 | # Leaving for easy access, but is default values
25 | sidecar:
26 | dashboards:
27 | enabled: true
28 | label: grafana_dashboard
29 | provider:
30 | # allow updating provisioned dashboards from the UI
31 | allowUiUpdates: false
32 | datasources:
33 | enabled: true
34 | defaultDatasourceEnabled: true
35 | label: grafana_datasource
36 |
37 | # Remove some scrapings we cannot perform
38 | kubeControllerManager:
39 | enabled: false
40 | kubeEtcd:
41 | enabled: false
42 | kubeScheduler:
43 | enabled: false
44 | kubeProxy:
45 | enabled: false
46 |
47 | prometheus:
48 | enabled: true
49 | ingress:
50 | enabled: true
51 |
52 | hosts:
53 | - prometheus.localhost
54 | paths:
55 | - /
56 | ## Settings affecting prometheusSpec
57 | ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#prometheusspec
58 | ##
59 | prometheusSpec:
60 | # Do not use Helm labels as selectors for prometheusRules
61 | ruleSelectorNilUsesHelmValues: false
62 | # Select prometheusRules with label "release:prometheus"
63 | ruleSelector:
64 | matchLabels:
65 | release: prometheus
66 |
67 | # Do not use Helm labels as selectors for serviceMonitors
68 | serviceMonitorSelectorNilUsesHelmValues: false
69 | # Select serviceMonitors with label "release:prometheus"
70 | serviceMonitorSelector:
71 | matchLabels:
72 | release: prometheus
73 |
74 | # Do not use Helm labels as selectors for podMonitor
75 | podMonitorSelectorNilUsesHelmValues: false
76 | # Select podMonitors with label "release:prometheus"
77 | podMonitorSelector:
78 | matchLabels:
79 | release: prometheus
80 |
81 | evaluationInterval: 1m
82 | scrape_interval: 1m
83 | resources:
84 | limits:
85 | cpu: 1000m
86 | memory: 1000Mi
87 | scrapeInterval: 1m
88 |
89 |
--------------------------------------------------------------------------------
/applications/platform/rbac/argo-workflow-sa.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1
2 | kind: ClusterRoleBinding
3 | metadata:
4 | name: argo-workflow-role
5 | roleRef:
6 | apiGroup: rbac.authorization.k8s.io
7 | kind: ClusterRole
8 | name: edit
9 | subjects:
10 | - kind: ServiceAccount
11 | name: default
12 | namespace: default
13 | ---
14 | apiVersion: rbac.authorization.k8s.io/v1
15 | kind: Role
16 | metadata:
17 | namespace: default
18 | name: spark-op-role
19 | rules:
20 | - apiGroups: ["sparkoperator.k8s.io"]
21 | resources: ["sparkapplications"]
22 | verbs: ["*"]
23 | ---
24 | apiVersion: rbac.authorization.k8s.io/v1
25 | kind: RoleBinding
26 | metadata:
27 | name: spark-op-role-binding
28 | namespace: default
29 | subjects:
30 | - kind: ServiceAccount
31 | name: spark
32 | namespace: default
33 | roleRef:
34 | kind: Role
35 | name: spark-op-role
36 | apiGroup: rbac.authorization.k8s.io
--------------------------------------------------------------------------------
/applications/platform/rbac/test-sa.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 | name: spark
5 | ---
6 | apiVersion: rbac.authorization.k8s.io/v1
7 | kind: ClusterRoleBinding
8 | metadata:
9 | name: spark-role
10 | roleRef:
11 | apiGroup: rbac.authorization.k8s.io
12 | kind: ClusterRole
13 | name: edit
14 | subjects:
15 | - kind: ServiceAccount
16 | name: spark
17 | namespace: default
--------------------------------------------------------------------------------
/applications/platform/spark-operator/Chart.lock:
--------------------------------------------------------------------------------
1 | dependencies:
2 | - name: spark-operator
3 | repository: https://googlecloudplatform.github.io/spark-on-k8s-operator
4 | version: 1.1.0
5 | digest: sha256:8fee80373931ff4f666619aa212ff504cfc3cd558f66643556a2bcb2d778de36
6 | generated: "2021-05-11T10:41:18.339255+02:00"
7 |
--------------------------------------------------------------------------------
/applications/platform/spark-operator/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: spark-operator
3 | type: application
4 | # https://github.com/GoogleCloudPlatform/spark-on-k8s-operator/tree/master/charts/spark-operator-chart
5 | version: 1.1.0
6 | dependencies:
7 | - name: spark-operator
8 | version: 1.1.0
9 | repository: https://googlecloudplatform.github.io/spark-on-k8s-operator
10 |
--------------------------------------------------------------------------------
/applications/platform/spark-operator/charts/spark-operator-1.1.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/empathyco/platform-spark-kubernetes-samples/2370113d3c76af7725b9476cf1d6f0852925374e/applications/platform/spark-operator/charts/spark-operator-1.1.0.tgz
--------------------------------------------------------------------------------
/applications/platform/spark-operator/dashboards/spark-operator.json:
--------------------------------------------------------------------------------
1 | {
2 | "annotations": {
3 | "list": [
4 | {
5 | "builtIn": 1,
6 | "datasource": "-- Grafana --",
7 | "enable": true,
8 | "hide": true,
9 | "iconColor": "rgba(0, 211, 255, 1)",
10 | "name": "Annotations & Alerts",
11 | "type": "dashboard"
12 | }
13 | ]
14 | },
15 | "editable": true,
16 | "gnetId": null,
17 | "graphTooltip": 0,
18 | "id": 22,
19 | "iteration": 1623163523224,
20 | "links": [],
21 | "panels": [
22 | {
23 | "collapsed": false,
24 | "datasource": null,
25 | "gridPos": {
26 | "h": 1,
27 | "w": 24,
28 | "x": 0,
29 | "y": 0
30 | },
31 | "id": 24,
32 | "panels": [],
33 | "title": "SparkApplication and Executors",
34 | "type": "row"
35 | },
36 | {
37 | "cacheTimeout": null,
38 | "colorBackground": false,
39 | "colorValue": false,
40 | "colors": [
41 | "#299c46",
42 | "rgba(237, 129, 40, 0.89)",
43 | "#d44a3a"
44 | ],
45 | "datasource": null,
46 | "description": "Total number of SparkApplication submitted by the Operator.",
47 | "fieldConfig": {
48 | "defaults": {},
49 | "overrides": []
50 | },
51 | "format": "none",
52 | "gauge": {
53 | "maxValue": 100,
54 | "minValue": 0,
55 | "show": false,
56 | "thresholdLabels": false,
57 | "thresholdMarkers": true
58 | },
59 | "gridPos": {
60 | "h": 3,
61 | "w": 12,
62 | "x": 0,
63 | "y": 1
64 | },
65 | "id": 12,
66 | "interval": null,
67 | "links": [],
68 | "mappingType": 1,
69 | "mappingTypes": [
70 | {
71 | "name": "value to text",
72 | "value": 1
73 | },
74 | {
75 | "name": "range to text",
76 | "value": 2
77 | }
78 | ],
79 | "maxDataPoints": 100,
80 | "nullPointMode": "connected",
81 | "nullText": null,
82 | "postfix": "",
83 | "postfixFontSize": "50%",
84 | "prefix": "",
85 | "prefixFontSize": "50%",
86 | "rangeMaps": [
87 | {
88 | "from": "null",
89 | "text": "N/A",
90 | "to": "null"
91 | }
92 | ],
93 | "sparkline": {
94 | "fillColor": "rgba(31, 118, 189, 0.18)",
95 | "full": false,
96 | "lineColor": "rgb(31, 120, 193)",
97 | "show": false
98 | },
99 | "tableColumn": "",
100 | "targets": [
101 | {
102 | "expr": "spark_app_submit_count{pod=\"$Spark_Operator_Instance\"}",
103 | "format": "time_series",
104 | "intervalFactor": 1,
105 | "legendFormat": "{{pod}}",
106 | "refId": "A"
107 | }
108 | ],
109 | "thresholds": "",
110 | "timeFrom": null,
111 | "timeShift": null,
112 | "title": "Submitted SparkApplications",
113 | "type": "singlestat",
114 | "valueFontSize": "80%",
115 | "valueMaps": [
116 | {
117 | "op": "=",
118 | "text": "N/A",
119 | "value": "null"
120 | }
121 | ],
122 | "valueName": "current"
123 | },
124 | {
125 | "cacheTimeout": null,
126 | "colorBackground": false,
127 | "colorValue": false,
128 | "colors": [
129 | "#299c46",
130 | "rgba(237, 129, 40, 0.89)",
131 | "#d44a3a"
132 | ],
133 | "datasource": null,
134 | "description": "Total number of SparkApplication which failed to complete.",
135 | "fieldConfig": {
136 | "defaults": {},
137 | "overrides": []
138 | },
139 | "format": "none",
140 | "gauge": {
141 | "maxValue": 100,
142 | "minValue": 0,
143 | "show": false,
144 | "thresholdLabels": false,
145 | "thresholdMarkers": true
146 | },
147 | "gridPos": {
148 | "h": 3,
149 | "w": 12,
150 | "x": 12,
151 | "y": 1
152 | },
153 | "id": 22,
154 | "interval": null,
155 | "links": [],
156 | "mappingType": 1,
157 | "mappingTypes": [
158 | {
159 | "name": "value to text",
160 | "value": 1
161 | },
162 | {
163 | "name": "range to text",
164 | "value": 2
165 | }
166 | ],
167 | "maxDataPoints": 100,
168 | "nullPointMode": "connected",
169 | "nullText": null,
170 | "postfix": "",
171 | "postfixFontSize": "50%",
172 | "prefix": "",
173 | "prefixFontSize": "50%",
174 | "rangeMaps": [
175 | {
176 | "from": "null",
177 | "text": "N/A",
178 | "to": "null"
179 | }
180 | ],
181 | "sparkline": {
182 | "fillColor": "rgba(31, 118, 189, 0.18)",
183 | "full": false,
184 | "lineColor": "rgb(31, 120, 193)",
185 | "show": false
186 | },
187 | "tableColumn": "",
188 | "targets": [
189 | {
190 | "expr": "spark_app_failure_count{pod=\"$Spark_Operator_Instance\"}",
191 | "format": "time_series",
192 | "intervalFactor": 1,
193 | "refId": "A"
194 | }
195 | ],
196 | "thresholds": "1",
197 | "timeFrom": null,
198 | "timeShift": null,
199 | "title": "Failed SparkApplications",
200 | "type": "singlestat",
201 | "valueFontSize": "80%",
202 | "valueMaps": [
203 | {
204 | "op": "=",
205 | "text": "0",
206 | "value": "null"
207 | }
208 | ],
209 | "valueName": "current"
210 | },
211 | {
212 | "cacheTimeout": null,
213 | "colorBackground": false,
214 | "colorValue": false,
215 | "colors": [
216 | "#299c46",
217 | "rgba(237, 129, 40, 0.89)",
218 | "#d44a3a"
219 | ],
220 | "datasource": null,
221 | "description": "Total number of SparkApplication which are currently running.",
222 | "fieldConfig": {
223 | "defaults": {},
224 | "overrides": []
225 | },
226 | "format": "none",
227 | "gauge": {
228 | "maxValue": 100,
229 | "minValue": 0,
230 | "show": false,
231 | "thresholdLabels": false,
232 | "thresholdMarkers": true
233 | },
234 | "gridPos": {
235 | "h": 3,
236 | "w": 12,
237 | "x": 0,
238 | "y": 4
239 | },
240 | "id": 14,
241 | "interval": null,
242 | "links": [],
243 | "mappingType": 1,
244 | "mappingTypes": [
245 | {
246 | "name": "value to text",
247 | "value": 1
248 | },
249 | {
250 | "name": "range to text",
251 | "value": 2
252 | }
253 | ],
254 | "maxDataPoints": 100,
255 | "nullPointMode": "connected",
256 | "nullText": null,
257 | "postfix": "",
258 | "postfixFontSize": "50%",
259 | "prefix": "",
260 | "prefixFontSize": "50%",
261 | "rangeMaps": [
262 | {
263 | "from": "null",
264 | "text": "N/A",
265 | "to": "null"
266 | }
267 | ],
268 | "sparkline": {
269 | "fillColor": "rgba(31, 118, 189, 0.18)",
270 | "full": false,
271 | "lineColor": "rgb(31, 120, 193)",
272 | "show": false
273 | },
274 | "tableColumn": "",
275 | "targets": [
276 | {
277 | "expr": "spark_app_running_count{pod=\"$Spark_Operator_Instance\"}",
278 | "format": "time_series",
279 | "intervalFactor": 1,
280 | "legendFormat": "{{pod}}",
281 | "refId": "A"
282 | }
283 | ],
284 | "thresholds": "",
285 | "timeFrom": null,
286 | "timeShift": null,
287 | "title": "Running SparkApplications",
288 | "type": "singlestat",
289 | "valueFontSize": "80%",
290 | "valueMaps": [
291 | {
292 | "op": "=",
293 | "text": "N/A",
294 | "value": "null"
295 | }
296 | ],
297 | "valueName": "current"
298 | },
299 | {
300 | "cacheTimeout": null,
301 | "colorBackground": false,
302 | "colorPostfix": false,
303 | "colorPrefix": false,
304 | "colorValue": true,
305 | "colors": [
306 | "#299c46",
307 | "rgba(237, 129, 40, 0.89)",
308 | "#d44a3a"
309 | ],
310 | "datasource": null,
311 | "description": "Total number of Spark Executors which failed.",
312 | "fieldConfig": {
313 | "defaults": {},
314 | "overrides": []
315 | },
316 | "format": "none",
317 | "gauge": {
318 | "maxValue": 100,
319 | "minValue": 0,
320 | "show": false,
321 | "thresholdLabels": false,
322 | "thresholdMarkers": true
323 | },
324 | "gridPos": {
325 | "h": 3,
326 | "w": 12,
327 | "x": 12,
328 | "y": 4
329 | },
330 | "id": 20,
331 | "interval": null,
332 | "links": [],
333 | "mappingType": 1,
334 | "mappingTypes": [
335 | {
336 | "name": "value to text",
337 | "value": 1
338 | },
339 | {
340 | "name": "range to text",
341 | "value": 2
342 | }
343 | ],
344 | "maxDataPoints": 100,
345 | "nullPointMode": "connected",
346 | "nullText": null,
347 | "postfix": "",
348 | "postfixFontSize": "50%",
349 | "prefix": "",
350 | "prefixFontSize": "50%",
351 | "rangeMaps": [
352 | {
353 | "from": "null",
354 | "text": "N/A",
355 | "to": "null"
356 | }
357 | ],
358 | "sparkline": {
359 | "fillColor": "rgba(31, 118, 189, 0.18)",
360 | "full": false,
361 | "lineColor": "rgb(31, 120, 193)",
362 | "show": false
363 | },
364 | "tableColumn": "",
365 | "targets": [
366 | {
367 | "expr": "spark_app_executor_failure_count{pod=\"$Spark_Operator_Instance\"}",
368 | "format": "time_series",
369 | "intervalFactor": 1,
370 | "legendFormat": "{{pod}}",
371 | "refId": "A"
372 | }
373 | ],
374 | "thresholds": "1,1",
375 | "timeFrom": null,
376 | "timeShift": null,
377 | "title": "Failed Spark Executors",
378 | "type": "singlestat",
379 | "valueFontSize": "80%",
380 | "valueMaps": [
381 | {
382 | "op": "=",
383 | "text": "0",
384 | "value": "null"
385 | }
386 | ],
387 | "valueName": "current"
388 | },
389 | {
390 | "cacheTimeout": null,
391 | "colorBackground": false,
392 | "colorValue": false,
393 | "colors": [
394 | "#299c46",
395 | "rgba(237, 129, 40, 0.89)",
396 | "#d44a3a"
397 | ],
398 | "datasource": null,
399 | "description": "Total number of SparkApplication which completed successfully.",
400 | "fieldConfig": {
401 | "defaults": {},
402 | "overrides": []
403 | },
404 | "format": "none",
405 | "gauge": {
406 | "maxValue": 100,
407 | "minValue": 0,
408 | "show": false,
409 | "thresholdLabels": false,
410 | "thresholdMarkers": true
411 | },
412 | "gridPos": {
413 | "h": 2,
414 | "w": 12,
415 | "x": 0,
416 | "y": 7
417 | },
418 | "id": 16,
419 | "interval": null,
420 | "links": [],
421 | "mappingType": 1,
422 | "mappingTypes": [
423 | {
424 | "name": "value to text",
425 | "value": 1
426 | },
427 | {
428 | "name": "range to text",
429 | "value": 2
430 | }
431 | ],
432 | "maxDataPoints": 100,
433 | "nullPointMode": "connected",
434 | "nullText": null,
435 | "postfix": "",
436 | "postfixFontSize": "50%",
437 | "prefix": "",
438 | "prefixFontSize": "50%",
439 | "rangeMaps": [
440 | {
441 | "from": "null",
442 | "text": "N/A",
443 | "to": "null"
444 | }
445 | ],
446 | "sparkline": {
447 | "fillColor": "rgba(31, 118, 189, 0.18)",
448 | "full": false,
449 | "lineColor": "rgb(31, 120, 193)",
450 | "show": false
451 | },
452 | "tableColumn": "",
453 | "targets": [
454 | {
455 | "expr": "spark_app_success_count{pod=\"$Spark_Operator_Instance\"}",
456 | "format": "time_series",
457 | "intervalFactor": 1,
458 | "legendFormat": "{{pod}}",
459 | "refId": "A"
460 | }
461 | ],
462 | "thresholds": "1,2",
463 | "timeFrom": null,
464 | "timeShift": null,
465 | "title": "Succ. completed SparkApplications.",
466 | "type": "singlestat",
467 | "valueFontSize": "80%",
468 | "valueMaps": [
469 | {
470 | "op": "=",
471 | "text": "N/A",
472 | "value": "null"
473 | }
474 | ],
475 | "valueName": "current"
476 | },
477 | {
478 | "cacheTimeout": null,
479 | "colorBackground": false,
480 | "colorValue": true,
481 | "colors": [
482 | "#299c46",
483 | "rgba(237, 129, 40, 0.89)",
484 | "#d44a3a"
485 | ],
486 | "datasource": null,
487 | "description": "Total number of Spark Executors which completed successfully.",
488 | "fieldConfig": {
489 | "defaults": {},
490 | "overrides": []
491 | },
492 | "format": "none",
493 | "gauge": {
494 | "maxValue": 100,
495 | "minValue": 0,
496 | "show": false,
497 | "thresholdLabels": false,
498 | "thresholdMarkers": true
499 | },
500 | "gridPos": {
501 | "h": 4,
502 | "w": 12,
503 | "x": 12,
504 | "y": 7
505 | },
506 | "id": 28,
507 | "interval": null,
508 | "links": [],
509 | "mappingType": 1,
510 | "mappingTypes": [
511 | {
512 | "name": "value to text",
513 | "value": 1
514 | },
515 | {
516 | "name": "range to text",
517 | "value": 2
518 | }
519 | ],
520 | "maxDataPoints": 100,
521 | "nullPointMode": "connected",
522 | "nullText": null,
523 | "postfix": "",
524 | "postfixFontSize": "50%",
525 | "prefix": "",
526 | "prefixFontSize": "50%",
527 | "rangeMaps": [
528 | {
529 | "from": "null",
530 | "text": "N/A",
531 | "to": "null"
532 | }
533 | ],
534 | "sparkline": {
535 | "fillColor": "rgba(31, 118, 189, 0.18)",
536 | "full": false,
537 | "lineColor": "rgb(31, 120, 193)",
538 | "show": false
539 | },
540 | "tableColumn": "",
541 | "targets": [
542 | {
543 | "expr": "spark_app_executor_success_count{pod=\"$Spark_Operator_Instance\"}",
544 | "format": "time_series",
545 | "intervalFactor": 1,
546 | "refId": "A"
547 | }
548 | ],
549 | "thresholds": "",
550 | "timeFrom": null,
551 | "timeShift": null,
552 | "title": "Succ. Completed Spark Executors",
553 | "type": "singlestat",
554 | "valueFontSize": "80%",
555 | "valueMaps": [
556 | {
557 | "op": "=",
558 | "text": "0",
559 | "value": "null"
560 | }
561 | ],
562 | "valueName": "current"
563 | },
564 | {
565 | "cacheTimeout": null,
566 | "colorBackground": false,
567 | "colorValue": false,
568 | "colors": [
569 | "#299c46",
570 | "rgba(237, 129, 40, 0.89)",
571 | "#d44a3a"
572 | ],
573 | "datasource": null,
574 | "description": "Total number of Spark Executors which are currently running.",
575 | "fieldConfig": {
576 | "defaults": {},
577 | "overrides": []
578 | },
579 | "format": "none",
580 | "gauge": {
581 | "maxValue": 100,
582 | "minValue": 0,
583 | "show": false,
584 | "thresholdLabels": false,
585 | "thresholdMarkers": true
586 | },
587 | "gridPos": {
588 | "h": 2,
589 | "w": 12,
590 | "x": 0,
591 | "y": 9
592 | },
593 | "id": 26,
594 | "interval": null,
595 | "links": [],
596 | "mappingType": 1,
597 | "mappingTypes": [
598 | {
599 | "name": "value to text",
600 | "value": 1
601 | },
602 | {
603 | "name": "range to text",
604 | "value": 2
605 | }
606 | ],
607 | "maxDataPoints": 100,
608 | "nullPointMode": "connected",
609 | "nullText": null,
610 | "postfix": "",
611 | "postfixFontSize": "50%",
612 | "prefix": "",
613 | "prefixFontSize": "50%",
614 | "rangeMaps": [
615 | {
616 | "from": "null",
617 | "text": "N/A",
618 | "to": "null"
619 | }
620 | ],
621 | "sparkline": {
622 | "fillColor": "rgba(31, 118, 189, 0.18)",
623 | "full": false,
624 | "lineColor": "rgb(31, 120, 193)",
625 | "show": false
626 | },
627 | "tableColumn": "",
628 | "targets": [
629 | {
630 | "expr": "spark_app_executor_running_count{pod=\"$Spark_Operator_Instance\"}",
631 | "format": "time_series",
632 | "intervalFactor": 1,
633 | "refId": "A"
634 | }
635 | ],
636 | "thresholds": "",
637 | "timeFrom": null,
638 | "timeShift": null,
639 | "title": "Running Spark Executors",
640 | "type": "singlestat",
641 | "valueFontSize": "80%",
642 | "valueMaps": [
643 | {
644 | "op": "=",
645 | "text": "N/A",
646 | "value": "null"
647 | }
648 | ],
649 | "valueName": "current"
650 | },
651 | {
652 | "collapsed": false,
653 | "datasource": null,
654 | "gridPos": {
655 | "h": 1,
656 | "w": 24,
657 | "x": 0,
658 | "y": 11
659 | },
660 | "id": 2,
661 | "panels": [],
662 | "repeat": null,
663 | "title": "SparkApplication Work Queue",
664 | "type": "row"
665 | },
666 | {
667 | "cacheTimeout": null,
668 | "colorBackground": false,
669 | "colorValue": false,
670 | "colors": [
671 | "#299c46",
672 | "rgba(237, 129, 40, 0.89)",
673 | "#d44a3a"
674 | ],
675 | "datasource": "Prometheus",
676 | "description": "Total number of adds handled by work queue",
677 | "fieldConfig": {
678 | "defaults": {},
679 | "overrides": []
680 | },
681 | "format": "none",
682 | "gauge": {
683 | "maxValue": 100,
684 | "minValue": 0,
685 | "show": false,
686 | "thresholdLabels": false,
687 | "thresholdMarkers": true
688 | },
689 | "gridPos": {
690 | "h": 2,
691 | "w": 12,
692 | "x": 0,
693 | "y": 12
694 | },
695 | "id": 4,
696 | "interval": null,
697 | "links": [],
698 | "mappingType": 1,
699 | "mappingTypes": [
700 | {
701 | "name": "value to text",
702 | "value": 1
703 | },
704 | {
705 | "name": "range to text",
706 | "value": 2
707 | }
708 | ],
709 | "maxDataPoints": 100,
710 | "nullPointMode": "connected",
711 | "nullText": null,
712 | "pluginVersion": "6.1.6",
713 | "postfix": "",
714 | "postfixFontSize": "50%",
715 | "prefix": "",
716 | "prefixFontSize": "50%",
717 | "rangeMaps": [
718 | {
719 | "from": "null",
720 | "text": "N/A",
721 | "to": "null"
722 | }
723 | ],
724 | "sparkline": {
725 | "fillColor": "rgba(31, 118, 189, 0.18)",
726 | "full": false,
727 | "lineColor": "rgb(31, 120, 193)",
728 | "show": false
729 | },
730 | "tableColumn": "",
731 | "targets": [
732 | {
733 | "expr": "spark_application_controller_adds{pod=\"$Spark_Operator_Instance\"}",
734 | "format": "time_series",
735 | "intervalFactor": 1,
736 | "legendFormat": "{{pod}}",
737 | "refId": "A"
738 | }
739 | ],
740 | "thresholds": "",
741 | "timeFrom": null,
742 | "timeShift": null,
743 | "title": "Adds handled by work queue",
744 | "type": "singlestat",
745 | "valueFontSize": "80%",
746 | "valueMaps": [
747 | {
748 | "op": "=",
749 | "text": "N/A",
750 | "value": "null"
751 | }
752 | ],
753 | "valueName": "current"
754 | },
755 | {
756 | "datasource": null,
757 | "description": "Current depth of work queue",
758 | "fieldConfig": {
759 | "defaults": {
760 | "color": {
761 | "mode": "thresholds"
762 | },
763 | "decimals": null,
764 | "mappings": [],
765 | "max": 100,
766 | "min": 0,
767 | "thresholds": {
768 | "mode": "absolute",
769 | "steps": [
770 | {
771 | "color": "green",
772 | "index": 0,
773 | "value": null
774 | },
775 | {
776 | "color": "red",
777 | "index": 1,
778 | "value": 80
779 | }
780 | ]
781 | },
782 | "unit": "none"
783 | },
784 | "overrides": []
785 | },
786 | "gridPos": {
787 | "h": 4,
788 | "w": 12,
789 | "x": 12,
790 | "y": 12
791 | },
792 | "id": 6,
793 | "links": [],
794 | "options": {
795 | "orientation": "auto",
796 | "reduceOptions": {
797 | "calcs": [
798 | "last"
799 | ],
800 | "fields": "",
801 | "values": false
802 | },
803 | "showThresholdLabels": false,
804 | "showThresholdMarkers": true,
805 | "text": {}
806 | },
807 | "pluginVersion": "7.5.3",
808 | "targets": [
809 | {
810 | "expr": "spark_application_controller_depth{pod=\"$Spark_Operator_Instance\"}",
811 | "format": "time_series",
812 | "intervalFactor": 1,
813 | "legendFormat": "",
814 | "refId": "A"
815 | }
816 | ],
817 | "timeFrom": null,
818 | "timeShift": null,
819 | "title": "Current depth of work queue",
820 | "type": "gauge"
821 | },
822 | {
823 | "cacheTimeout": null,
824 | "colorBackground": false,
825 | "colorValue": false,
826 | "colors": [
827 | "#299c46",
828 | "rgba(237, 129, 40, 0.89)",
829 | "#d44a3a"
830 | ],
831 | "datasource": null,
832 | "description": "Total number of retries handled by work queue",
833 | "fieldConfig": {
834 | "defaults": {},
835 | "overrides": []
836 | },
837 | "format": "none",
838 | "gauge": {
839 | "maxValue": 100,
840 | "minValue": 0,
841 | "show": false,
842 | "thresholdLabels": false,
843 | "thresholdMarkers": true
844 | },
845 | "gridPos": {
846 | "h": 2,
847 | "w": 12,
848 | "x": 0,
849 | "y": 14
850 | },
851 | "id": 31,
852 | "interval": null,
853 | "links": [],
854 | "mappingType": 1,
855 | "mappingTypes": [
856 | {
857 | "name": "value to text",
858 | "value": 1
859 | },
860 | {
861 | "name": "range to text",
862 | "value": 2
863 | }
864 | ],
865 | "maxDataPoints": 100,
866 | "nullPointMode": "connected",
867 | "nullText": null,
868 | "postfix": "",
869 | "postfixFontSize": "50%",
870 | "prefix": "",
871 | "prefixFontSize": "50%",
872 | "rangeMaps": [
873 | {
874 | "from": "null",
875 | "text": "N/A",
876 | "to": "null"
877 | }
878 | ],
879 | "sparkline": {
880 | "fillColor": "rgba(31, 118, 189, 0.18)",
881 | "full": false,
882 | "lineColor": "rgb(31, 120, 193)",
883 | "show": false
884 | },
885 | "tableColumn": "",
886 | "targets": [
887 | {
888 | "expr": "spark_application_controller_retries{pod=\"$Spark_Operator_Instance\"}",
889 | "format": "time_series",
890 | "intervalFactor": 1,
891 | "refId": "A"
892 | }
893 | ],
894 | "thresholds": "",
895 | "timeFrom": null,
896 | "timeShift": null,
897 | "title": "Retries handled by work queue",
898 | "type": "singlestat",
899 | "valueFontSize": "80%",
900 | "valueMaps": [
901 | {
902 | "op": "=",
903 | "text": "N/A",
904 | "value": "null"
905 | }
906 | ],
907 | "valueName": "current"
908 | },
909 | {
910 | "cacheTimeout": null,
911 | "colorBackground": false,
912 | "colorValue": false,
913 | "colors": [
914 | "#299c46",
915 | "rgba(237, 129, 40, 0.89)",
916 | "#d44a3a"
917 | ],
918 | "datasource": null,
919 | "description": "Unfinished work in seconds",
920 | "fieldConfig": {
921 | "defaults": {},
922 | "overrides": []
923 | },
924 | "format": "none",
925 | "gauge": {
926 | "maxValue": 100,
927 | "minValue": 0,
928 | "show": false,
929 | "thresholdLabels": false,
930 | "thresholdMarkers": true
931 | },
932 | "gridPos": {
933 | "h": 4,
934 | "w": 12,
935 | "x": 0,
936 | "y": 16
937 | },
938 | "id": 33,
939 | "interval": null,
940 | "links": [],
941 | "mappingType": 1,
942 | "mappingTypes": [
943 | {
944 | "name": "value to text",
945 | "value": 1
946 | },
947 | {
948 | "name": "range to text",
949 | "value": 2
950 | }
951 | ],
952 | "maxDataPoints": 100,
953 | "nullPointMode": "connected",
954 | "nullText": null,
955 | "postfix": " sec",
956 | "postfixFontSize": "50%",
957 | "prefix": "",
958 | "prefixFontSize": "50%",
959 | "rangeMaps": [
960 | {
961 | "from": "null",
962 | "text": "N/A",
963 | "to": "null"
964 | }
965 | ],
966 | "sparkline": {
967 | "fillColor": "rgba(31, 118, 189, 0.18)",
968 | "full": false,
969 | "lineColor": "rgb(31, 120, 193)",
970 | "show": true
971 | },
972 | "tableColumn": "",
973 | "targets": [
974 | {
975 | "expr": "spark_application_controller_unfinished_work_seconds{pod=\"$Spark_Operator_Instance\"}",
976 | "format": "time_series",
977 | "intervalFactor": 1,
978 | "refId": "A"
979 | }
980 | ],
981 | "thresholds": "",
982 | "timeFrom": null,
983 | "timeShift": null,
984 | "title": "Unfinished work",
985 | "type": "singlestat",
986 | "valueFontSize": "80%",
987 | "valueMaps": [
988 | {
989 | "op": "=",
990 | "text": "N/A",
991 | "value": "null"
992 | }
993 | ],
994 | "valueName": "current"
995 | },
996 | {
997 | "cacheTimeout": null,
998 | "colorBackground": false,
999 | "colorValue": false,
1000 | "colors": [
1001 | "#299c46",
1002 | "rgba(237, 129, 40, 0.89)",
1003 | "#d44a3a"
1004 | ],
1005 | "datasource": null,
1006 | "description": "Longest running processor in microseconds",
1007 | "fieldConfig": {
1008 | "defaults": {},
1009 | "overrides": []
1010 | },
1011 | "format": "none",
1012 | "gauge": {
1013 | "maxValue": 100,
1014 | "minValue": 0,
1015 | "show": false,
1016 | "thresholdLabels": false,
1017 | "thresholdMarkers": true
1018 | },
1019 | "gridPos": {
1020 | "h": 4,
1021 | "w": 12,
1022 | "x": 12,
1023 | "y": 16
1024 | },
1025 | "id": 35,
1026 | "interval": null,
1027 | "links": [],
1028 | "mappingType": 1,
1029 | "mappingTypes": [
1030 | {
1031 | "name": "value to text",
1032 | "value": 1
1033 | },
1034 | {
1035 | "name": "range to text",
1036 | "value": 2
1037 | }
1038 | ],
1039 | "maxDataPoints": 100,
1040 | "nullPointMode": "connected",
1041 | "nullText": null,
1042 | "postfix": " µs",
1043 | "postfixFontSize": "50%",
1044 | "prefix": "",
1045 | "prefixFontSize": "50%",
1046 | "rangeMaps": [
1047 | {
1048 | "from": "null",
1049 | "text": "N/A",
1050 | "to": "null"
1051 | }
1052 | ],
1053 | "sparkline": {
1054 | "fillColor": "rgba(31, 118, 189, 0.18)",
1055 | "full": false,
1056 | "lineColor": "rgb(31, 120, 193)",
1057 | "show": false
1058 | },
1059 | "tableColumn": "",
1060 | "targets": [
1061 | {
1062 | "expr": "spark_application_controller_longest_running_processor_microseconds{pod=\"$Spark_Operator_Instance\"}",
1063 | "format": "time_series",
1064 | "intervalFactor": 1,
1065 | "legendFormat": "",
1066 | "refId": "A"
1067 | }
1068 | ],
1069 | "thresholds": "",
1070 | "timeFrom": null,
1071 | "timeShift": null,
1072 | "title": "Longest running processor",
1073 | "type": "singlestat",
1074 | "valueFontSize": "80%",
1075 | "valueMaps": [
1076 | {
1077 | "op": "=",
1078 | "text": "N/A",
1079 | "value": "null"
1080 | }
1081 | ],
1082 | "valueName": "current"
1083 | },
1084 | {
1085 | "collapsed": false,
1086 | "datasource": null,
1087 | "gridPos": {
1088 | "h": 1,
1089 | "w": 24,
1090 | "x": 0,
1091 | "y": 20
1092 | },
1093 | "id": 37,
1094 | "panels": [],
1095 | "title": "ScheduledApplication Work Queue",
1096 | "type": "row"
1097 | },
1098 | {
1099 | "cacheTimeout": null,
1100 | "colorBackground": false,
1101 | "colorValue": false,
1102 | "colors": [
1103 | "#299c46",
1104 | "rgba(237, 129, 40, 0.89)",
1105 | "#d44a3a"
1106 | ],
1107 | "datasource": "Prometheus",
1108 | "description": "Total number of adds handled by workqueue",
1109 | "fieldConfig": {
1110 | "defaults": {},
1111 | "overrides": []
1112 | },
1113 | "format": "none",
1114 | "gauge": {
1115 | "maxValue": 100,
1116 | "minValue": 0,
1117 | "show": false,
1118 | "thresholdLabels": false,
1119 | "thresholdMarkers": true
1120 | },
1121 | "gridPos": {
1122 | "h": 3,
1123 | "w": 12,
1124 | "x": 0,
1125 | "y": 21
1126 | },
1127 | "id": 39,
1128 | "interval": null,
1129 | "links": [],
1130 | "mappingType": 1,
1131 | "mappingTypes": [
1132 | {
1133 | "name": "value to text",
1134 | "value": 1
1135 | },
1136 | {
1137 | "name": "range to text",
1138 | "value": 2
1139 | }
1140 | ],
1141 | "maxDataPoints": 100,
1142 | "nullPointMode": "connected",
1143 | "nullText": null,
1144 | "pluginVersion": "6.1.6",
1145 | "postfix": "",
1146 | "postfixFontSize": "50%",
1147 | "prefix": "",
1148 | "prefixFontSize": "50%",
1149 | "rangeMaps": [
1150 | {
1151 | "from": "null",
1152 | "text": "N/A",
1153 | "to": "null"
1154 | }
1155 | ],
1156 | "sparkline": {
1157 | "fillColor": "rgba(31, 118, 189, 0.18)",
1158 | "full": false,
1159 | "lineColor": "rgb(31, 120, 193)",
1160 | "show": false
1161 | },
1162 | "tableColumn": "",
1163 | "targets": [
1164 | {
1165 | "expr": "scheduled_spark_application_controller_adds{pod=\"$Spark_Operator_Instance\"}",
1166 | "format": "time_series",
1167 | "intervalFactor": 1,
1168 | "legendFormat": "{{pod}}",
1169 | "refId": "A"
1170 | }
1171 | ],
1172 | "thresholds": "",
1173 | "timeFrom": null,
1174 | "timeShift": null,
1175 | "title": "Adds handled by work queue",
1176 | "type": "singlestat",
1177 | "valueFontSize": "80%",
1178 | "valueMaps": [
1179 | {
1180 | "op": "=",
1181 | "text": "N/A",
1182 | "value": "null"
1183 | }
1184 | ],
1185 | "valueName": "current"
1186 | },
1187 | {
1188 | "datasource": null,
1189 | "description": "Current depth of work queue",
1190 | "fieldConfig": {
1191 | "defaults": {
1192 | "color": {
1193 | "mode": "thresholds"
1194 | },
1195 | "decimals": null,
1196 | "mappings": [],
1197 | "max": 100,
1198 | "min": 0,
1199 | "thresholds": {
1200 | "mode": "absolute",
1201 | "steps": [
1202 | {
1203 | "color": "green",
1204 | "index": 0,
1205 | "value": null
1206 | },
1207 | {
1208 | "color": "red",
1209 | "index": 1,
1210 | "value": 80
1211 | }
1212 | ]
1213 | },
1214 | "unit": "none"
1215 | },
1216 | "overrides": []
1217 | },
1218 | "gridPos": {
1219 | "h": 5,
1220 | "w": 12,
1221 | "x": 12,
1222 | "y": 21
1223 | },
1224 | "id": 45,
1225 | "links": [],
1226 | "options": {
1227 | "orientation": "auto",
1228 | "reduceOptions": {
1229 | "calcs": [
1230 | "last"
1231 | ],
1232 | "fields": "",
1233 | "values": false
1234 | },
1235 | "showThresholdLabels": false,
1236 | "showThresholdMarkers": true,
1237 | "text": {}
1238 | },
1239 | "pluginVersion": "7.5.3",
1240 | "targets": [
1241 | {
1242 | "expr": "scheduled_spark_application_controller_depth{pod=\"$Spark_Operator_Instance\"}",
1243 | "format": "time_series",
1244 | "intervalFactor": 1,
1245 | "legendFormat": "",
1246 | "refId": "A"
1247 | }
1248 | ],
1249 | "timeFrom": null,
1250 | "timeShift": null,
1251 | "title": "Current depth of work queue",
1252 | "type": "gauge"
1253 | },
1254 | {
1255 | "cacheTimeout": null,
1256 | "colorBackground": false,
1257 | "colorValue": false,
1258 | "colors": [
1259 | "#299c46",
1260 | "rgba(237, 129, 40, 0.89)",
1261 | "#d44a3a"
1262 | ],
1263 | "datasource": null,
1264 | "description": "Total number of retries handled by work queue",
1265 | "fieldConfig": {
1266 | "defaults": {},
1267 | "overrides": []
1268 | },
1269 | "format": "none",
1270 | "gauge": {
1271 | "maxValue": 100,
1272 | "minValue": 0,
1273 | "show": false,
1274 | "thresholdLabels": false,
1275 | "thresholdMarkers": true
1276 | },
1277 | "gridPos": {
1278 | "h": 2,
1279 | "w": 12,
1280 | "x": 0,
1281 | "y": 24
1282 | },
1283 | "id": 43,
1284 | "interval": null,
1285 | "links": [],
1286 | "mappingType": 1,
1287 | "mappingTypes": [
1288 | {
1289 | "name": "value to text",
1290 | "value": 1
1291 | },
1292 | {
1293 | "name": "range to text",
1294 | "value": 2
1295 | }
1296 | ],
1297 | "maxDataPoints": 100,
1298 | "nullPointMode": "connected",
1299 | "nullText": null,
1300 | "postfix": "",
1301 | "postfixFontSize": "50%",
1302 | "prefix": "",
1303 | "prefixFontSize": "50%",
1304 | "rangeMaps": [
1305 | {
1306 | "from": "null",
1307 | "text": "N/A",
1308 | "to": "null"
1309 | }
1310 | ],
1311 | "sparkline": {
1312 | "fillColor": "rgba(31, 118, 189, 0.18)",
1313 | "full": false,
1314 | "lineColor": "rgb(31, 120, 193)",
1315 | "show": false
1316 | },
1317 | "tableColumn": "",
1318 | "targets": [
1319 | {
1320 | "expr": "scheduled_spark_application_controller_retries{pod=\"$Spark_Operator_Instance\"}",
1321 | "format": "time_series",
1322 | "intervalFactor": 1,
1323 | "refId": "A"
1324 | }
1325 | ],
1326 | "thresholds": "",
1327 | "timeFrom": null,
1328 | "timeShift": null,
1329 | "title": "Retries handled by work queue",
1330 | "type": "singlestat",
1331 | "valueFontSize": "80%",
1332 | "valueMaps": [
1333 | {
1334 | "op": "=",
1335 | "text": "N/A",
1336 | "value": "null"
1337 | }
1338 | ],
1339 | "valueName": "current"
1340 | },
1341 | {
1342 | "cacheTimeout": null,
1343 | "colorBackground": false,
1344 | "colorValue": false,
1345 | "colors": [
1346 | "#299c46",
1347 | "rgba(237, 129, 40, 0.89)",
1348 | "#d44a3a"
1349 | ],
1350 | "datasource": null,
1351 | "description": "Unfinished work in seconds",
1352 | "fieldConfig": {
1353 | "defaults": {},
1354 | "overrides": []
1355 | },
1356 | "format": "none",
1357 | "gauge": {
1358 | "maxValue": 100,
1359 | "minValue": 0,
1360 | "show": false,
1361 | "thresholdLabels": false,
1362 | "thresholdMarkers": true
1363 | },
1364 | "gridPos": {
1365 | "h": 4,
1366 | "w": 12,
1367 | "x": 0,
1368 | "y": 26
1369 | },
1370 | "id": 41,
1371 | "interval": null,
1372 | "links": [],
1373 | "mappingType": 1,
1374 | "mappingTypes": [
1375 | {
1376 | "name": "value to text",
1377 | "value": 1
1378 | },
1379 | {
1380 | "name": "range to text",
1381 | "value": 2
1382 | }
1383 | ],
1384 | "maxDataPoints": 100,
1385 | "nullPointMode": "connected",
1386 | "nullText": null,
1387 | "postfix": " sec",
1388 | "postfixFontSize": "50%",
1389 | "prefix": "",
1390 | "prefixFontSize": "50%",
1391 | "rangeMaps": [
1392 | {
1393 | "from": "null",
1394 | "text": "N/A",
1395 | "to": "null"
1396 | }
1397 | ],
1398 | "sparkline": {
1399 | "fillColor": "rgba(31, 118, 189, 0.18)",
1400 | "full": false,
1401 | "lineColor": "rgb(31, 120, 193)",
1402 | "show": true
1403 | },
1404 | "tableColumn": "",
1405 | "targets": [
1406 | {
1407 | "expr": "scheduled_spark_application_controller_unfinished_work_seconds{pod=\"$Spark_Operator_Instance\"}",
1408 | "format": "time_series",
1409 | "intervalFactor": 1,
1410 | "refId": "A"
1411 | }
1412 | ],
1413 | "thresholds": "",
1414 | "timeFrom": null,
1415 | "timeShift": null,
1416 | "title": "Unfinished work",
1417 | "type": "singlestat",
1418 | "valueFontSize": "80%",
1419 | "valueMaps": [
1420 | {
1421 | "op": "=",
1422 | "text": "N/A",
1423 | "value": "null"
1424 | }
1425 | ],
1426 | "valueName": "current"
1427 | },
1428 | {
1429 | "cacheTimeout": null,
1430 | "colorBackground": false,
1431 | "colorValue": false,
1432 | "colors": [
1433 | "#299c46",
1434 | "rgba(237, 129, 40, 0.89)",
1435 | "#d44a3a"
1436 | ],
1437 | "datasource": null,
1438 | "description": "Longest running processor in microseconds",
1439 | "fieldConfig": {
1440 | "defaults": {},
1441 | "overrides": []
1442 | },
1443 | "format": "none",
1444 | "gauge": {
1445 | "maxValue": 100,
1446 | "minValue": 0,
1447 | "show": false,
1448 | "thresholdLabels": false,
1449 | "thresholdMarkers": true
1450 | },
1451 | "gridPos": {
1452 | "h": 4,
1453 | "w": 12,
1454 | "x": 12,
1455 | "y": 26
1456 | },
1457 | "id": 47,
1458 | "interval": null,
1459 | "links": [],
1460 | "mappingType": 1,
1461 | "mappingTypes": [
1462 | {
1463 | "name": "value to text",
1464 | "value": 1
1465 | },
1466 | {
1467 | "name": "range to text",
1468 | "value": 2
1469 | }
1470 | ],
1471 | "maxDataPoints": 100,
1472 | "nullPointMode": "connected",
1473 | "nullText": null,
1474 | "postfix": " µs",
1475 | "postfixFontSize": "50%",
1476 | "prefix": "",
1477 | "prefixFontSize": "50%",
1478 | "rangeMaps": [
1479 | {
1480 | "from": "null",
1481 | "text": "N/A",
1482 | "to": "null"
1483 | }
1484 | ],
1485 | "sparkline": {
1486 | "fillColor": "rgba(31, 118, 189, 0.18)",
1487 | "full": false,
1488 | "lineColor": "rgb(31, 120, 193)",
1489 | "show": false
1490 | },
1491 | "tableColumn": "",
1492 | "targets": [
1493 | {
1494 | "expr": "scheduled_spark_application_controller_longest_running_processor_microseconds{pod=\"$Spark_Operator_Instance\"}",
1495 | "format": "time_series",
1496 | "intervalFactor": 1,
1497 | "legendFormat": "",
1498 | "refId": "A"
1499 | }
1500 | ],
1501 | "thresholds": "",
1502 | "timeFrom": null,
1503 | "timeShift": null,
1504 | "title": "Longest running processor",
1505 | "type": "singlestat",
1506 | "valueFontSize": "80%",
1507 | "valueMaps": [
1508 | {
1509 | "op": "=",
1510 | "text": "N/A",
1511 | "value": "null"
1512 | }
1513 | ],
1514 | "valueName": "current"
1515 | }
1516 | ],
1517 | "refresh": false,
1518 | "schemaVersion": 27,
1519 | "style": "dark",
1520 | "tags": [],
1521 | "templating": {
1522 | "list": [
1523 | {
1524 | "allValue": null,
1525 | "current": {
1526 | "selected": false,
1527 | "text": "spark-operator-588dd7b89c-h6klb",
1528 | "value": "spark-operator-588dd7b89c-h6klb"
1529 | },
1530 | "datasource": "Prometheus",
1531 | "definition": "label_values(kube_pod_labels{label_app_kubernetes_io_name=\"spark-operator\"}, pod)",
1532 | "description": null,
1533 | "error": null,
1534 | "hide": 0,
1535 | "includeAll": false,
1536 | "label": "Spark Operator Instance",
1537 | "multi": false,
1538 | "name": "Spark_Operator_Instance",
1539 | "options": [],
1540 | "query": {
1541 | "query": "label_values(kube_pod_labels{label_app_kubernetes_io_name=\"spark-operator\"}, pod)",
1542 | "refId": "Prometheus-Spark_Operator_Instance-Variable-Query"
1543 | },
1544 | "refresh": 1,
1545 | "regex": "",
1546 | "skipUrlSync": false,
1547 | "sort": 0,
1548 | "tagValuesQuery": "",
1549 | "tags": [],
1550 | "tagsQuery": "",
1551 | "type": "query",
1552 | "useTags": false
1553 | }
1554 | ]
1555 | },
1556 | "time": {
1557 | "from": "now-12h",
1558 | "to": "now"
1559 | },
1560 | "timepicker": {
1561 | "refresh_intervals": [
1562 | "5s",
1563 | "10s",
1564 | "30s",
1565 | "1m",
1566 | "5m",
1567 | "15m",
1568 | "30m",
1569 | "1h",
1570 | "2h",
1571 | "1d"
1572 | ],
1573 | "time_options": [
1574 | "5m",
1575 | "15m",
1576 | "1h",
1577 | "6h",
1578 | "12h",
1579 | "24h",
1580 | "2d",
1581 | "7d",
1582 | "30d"
1583 | ]
1584 | },
1585 | "timezone": "browser",
1586 | "title": "Spark Operator",
1587 | "uid": "QK_ufP6Gk"
1588 | }
--------------------------------------------------------------------------------
/applications/platform/spark-operator/templates/_helpers.tpl:
--------------------------------------------------------------------------------
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
Honors .Values.nameOverride when set; otherwise falls back to the chart name.
Truncated to 63 characters because Kubernetes resource names must fit the
DNS-1123 label limit; a trailing "-" left behind by truncation is stripped.
*/}}
{{- define "service.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If the release name already contains the chart name it is used as the full name,
which avoids doubled names like "myapp-myapp" when the release is named after
the chart. .Values.fullnameOverride, when set, takes precedence over both.
*/}}
{{- define "service.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}

{{/*
Create chart name and version as used by the chart label.
"+" is not a legal character in a label value, so semver build-metadata
separators in the chart version are replaced with "_".
*/}}
{{- define "service.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Common labels applied to every resource rendered by this chart:
the selector labels plus chart, version, and managed-by metadata.
*/}}
{{- define "service.labels" -}}
helm.sh/chart: {{ include "service.chart" . }}
{{ include "service.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end -}}

{{/*
Selector labels: the stable subset of the common labels that is safe to use
in spec.selector / matchLabels, since it never changes across upgrades.
*/}}
{{- define "service.selectorLabels" -}}
app.kubernetes.io/name: {{ include "service.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end -}}
--------------------------------------------------------------------------------
/applications/platform/spark-operator/templates/configmap-dashboard.yaml:
--------------------------------------------------------------------------------
apiVersion: v1
kind: ConfigMap
metadata:
  # Placed in the monitoring namespace so the Grafana dashboard sidecar
  # shipped with kube-prometheus-stack can discover it.
  namespace: monitoring
  name: {{ include "service.fullname" . }}-dashboard
  labels:
    # Marker label watched by the Grafana dashboards sidecar; its presence
    # (any value) causes the ConfigMap's data to be loaded as dashboards.
    grafana_dashboard: {{ include "service.fullname" . }}
    {{- include "service.labels" . | nindent 4 }}
data:
  # Embed every JSON file under dashboards/ as one ConfigMap entry,
  # keyed by filename (e.g. spark-operator.json).
{{ (.Files.Glob "dashboards/*.json").AsConfig | indent 2 }}
--------------------------------------------------------------------------------
/applications/platform/spark-operator/values.yaml:
--------------------------------------------------------------------------------
1 | spark-operator:
  # Pin the image tag explicitly: the chart otherwise defaults to the mutable "latest" tag
3 | # https://github.com/GoogleCloudPlatform/spark-on-k8s-operator/issues/1253
4 | image:
5 | tag: v1beta2-1.2.3-3.1.1
6 | webhook:
7 | enable: true
8 | podMonitor:
9 | enable: true
10 | labels:
11 | release: prometheus
12 |
--------------------------------------------------------------------------------
/applications/spark-apps/hello-argo-workflow-template/hello-argo-workflow-template.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: argoproj.io/v1alpha1
2 | kind: WorkflowTemplate
3 | metadata:
4 | name: hello-argo-workflow-template
5 | spec:
6 | serviceAccountName: spark
7 | entrypoint: hello-workflow-template
8 | activeDeadlineSeconds: 10800 # 3h
9 | ttlStrategy:
10 | secondsAfterCompletion: 604800 # 7d
11 | arguments:
12 | parameters:
13 | - name: job
14 | value: 'jobName'
15 | - name: mainClass
16 | value: 'org.apache.spark.examples.SparkPi'
17 | templates:
18 | - name: hello-workflow-template
19 | inputs:
20 | parameters:
21 | - name: job
22 | - name: mainClass
23 | outputs: {}
24 | metadata: {}
25 | resource:
26 | action: create
27 | manifest: |
28 | apiVersion: "sparkoperator.k8s.io/v1beta2"
29 | kind: SparkApplication
30 | metadata:
31 | generateName: {{inputs.parameters.job}}-
32 | spec:
33 | type: Scala
34 | mode: cluster
35 | image: "gcr.io/spark-operator/spark:v3.0.0"
36 | imagePullPolicy: Always
37 | mainClass: {{inputs.parameters.mainClass}}
38 | mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.12-3.0.0.jar"
39 | sparkVersion: "3.0.0"
40 | restartPolicy:
41 | type: Never
42 | volumes:
43 | - name: "test-volume"
44 | hostPath:
45 | path: "/tmp"
46 | type: Directory
47 | driver:
48 | cores: 1
49 | coreLimit: "1200m"
50 | memory: "512m"
51 | labels:
52 | version: 3.0.0
53 | serviceAccount: spark
54 | volumeMounts:
55 | - name: "test-volume"
56 | mountPath: "/tmp"
57 | executor:
58 | cores: 1
59 | instances: 1
60 | memory: "512m"
61 | labels:
62 | version: 3.0.0
63 | volumeMounts:
64 | - name: "test-volume"
65 | mountPath: "/tmp"
66 | successCondition: status.applicationState.state in (COMPLETED)
67 | failureCondition: 'status.applicationState.state in (FAILED, SUBMISSION_FAILED, UNKNOWN)'
--------------------------------------------------------------------------------
/applications/spark-apps/hello-argo-workflows/hello-world-dag.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: argoproj.io/v1alpha1
2 | kind: Workflow
3 | metadata:
4 | name: dag-diamond
5 | spec:
6 | entrypoint: diamond
7 | templates:
8 | - name: echo
9 | inputs:
10 | parameters:
11 | - name: message
12 | container:
13 | image: alpine:3.7
14 | command: [echo, "{{inputs.parameters.message}}"]
15 | - name: diamond
16 | dag:
17 | tasks:
18 | - name: A
19 | template: echo
20 | arguments:
21 | parameters: [{name: message, value: A}]
22 | - name: B
23 | dependencies: [A]
24 | template: echo
25 | arguments:
26 | parameters: [{name: message, value: B}]
27 | - name: C
28 | dependencies: [A]
29 | template: echo
30 | arguments:
31 | parameters: [{name: message, value: C}]
32 | - name: D
33 | dependencies: [B, C]
34 | template: echo
35 | arguments:
36 | parameters: [{name: message, value: D}]
--------------------------------------------------------------------------------
/applications/spark-apps/hello-spark-operator-argo-workflows/spark-operator-kubernetes-dag.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: argoproj.io/v1alpha1
2 | kind: Workflow
3 | metadata:
4 | name: spark-kubernetes-dag
5 | spec:
6 | entrypoint: sparkling-operator
7 | serviceAccountName: spark
8 | templates:
9 | - name: sparkpi
10 | resource:
11 | action: create
12 | successCondition: status.applicationState.state in (COMPLETED)
13 | failureCondition: 'status.applicationState.state in (FAILED, SUBMISSION_FAILED, UNKNOWN)'
14 | manifest: |
15 | apiVersion: "sparkoperator.k8s.io/v1beta2"
16 | kind: SparkApplication
17 | metadata:
          generateName: spark-pi-
19 | spec:
20 | type: Scala
21 | mode: cluster
22 | image: "gcr.io/spark-operator/spark:v3.0.0"
23 | imagePullPolicy: Always
24 | mainClass: org.apache.spark.examples.SparkPi
25 | mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.12-3.0.0.jar"
26 | sparkVersion: "3.0.0"
27 | restartPolicy:
28 | type: Never
29 | volumes:
30 | - name: "test-volume"
31 | hostPath:
32 | path: "/tmp"
33 | type: Directory
34 | driver:
35 | cores: 1
36 | coreLimit: "1200m"
37 | memory: "512m"
38 | labels:
39 | version: 3.0.0
40 | serviceAccount: spark
41 | volumeMounts:
42 | - name: "test-volume"
43 | mountPath: "/tmp"
44 | executor:
45 | cores: 1
46 | instances: 1
47 | memory: "512m"
48 | labels:
49 | version: 3.0.0
50 | volumeMounts:
51 | - name: "test-volume"
52 | mountPath: "/tmp"
53 | - name: sparkling-operator
54 | dag:
55 | tasks:
56 | - name: SparkPi1
57 | template: sparkpi
58 | - name: SparkPi2
59 | dependencies: [SparkPi1]
60 | template: sparkpi
61 | - name: SparkPi3
62 | dependencies: [SparkPi1]
63 | template: sparkpi
64 |
--------------------------------------------------------------------------------
/applications/spark-apps/hello-spark-operator-history/spark-application.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: "sparkoperator.k8s.io/v1beta2"
2 | kind: SparkApplication
3 | metadata:
4 | name: spark-pi-history
5 | namespace: default
6 | spec:
7 | securityContext:
8 | runAsUser: 0
9 | type: Scala
10 | mode: cluster
11 | image: "k8s-test/spark-prometheus:v1.6"
12 | imagePullPolicy: Never
13 | mainClass: org.apache.spark.examples.SparkPi
14 | mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.12-3.0.0.jar"
15 | sparkVersion: "3.0.0"
16 | restartPolicy:
17 | type: Never
18 | driver:
19 | env:
20 | - name: AWS_ACCESS_KEY_ID
21 | valueFrom:
22 | secretKeyRef:
23 | name: aws-secrets
24 | key: AWS_ACCESS_KEY
25 | - name: AWS_SECRET_ACCESS_KEY
26 | valueFrom:
27 | secretKeyRef:
28 | name: aws-secrets
29 | key: AWS_SECRET_ACCESS_KEY
30 | - name: AWS_REGION
31 | value: "eu-west-1"
32 | cores: 1
33 | coreLimit: "1200m"
34 | memory: "512m"
35 | labels:
36 | version: 3.0.0
37 | serviceAccount: spark
38 | executor:
39 | cores: 1
40 | instances: 1
41 | memory: "512m"
42 | labels:
43 | version: 3.0.0
44 | hadoopConf:
45 | spark.hadoop.fs.s3a.impl: org.apache.hadoop.fs.s3a.S3AFileSystem
46 | spark.hadoop.fs.s3a.path.style.access: "true"
47 | sparkConf:
    spark.driver.extraJavaOptions: -Dcom.amazonaws.services.s3.enableV4=true
    spark.executor.extraJavaOptions: -Dcom.amazonaws.services.s3.enableV4=true
49 | spark.eventLog.dir: 's3a://k8s-days-spain-spark/spark-logs'
50 | spark.eventLog.enabled: 'true'
51 | spark.eventLog.rolling.enabled: 'true'
52 | spark.eventLog.rolling.maxFileSize: 128m
--------------------------------------------------------------------------------
/applications/spark-apps/hello-spark-operator-prometheus/hello-spark-operator-prometheus.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: "sparkoperator.k8s.io/v1beta2"
2 | kind: SparkApplication
3 | metadata:
4 | name: spark-pi-prom
5 | namespace: default
6 | spec:
7 | type: Scala
8 | mode: cluster
9 | image: "k8s-test/spark-prometheus:v1.2"
10 | imagePullPolicy: Never
11 | mainClass: org.apache.spark.examples.SparkPi
12 | mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.12-3.0.0.jar"
13 | sparkVersion: "3.0.0"
14 | restartPolicy:
15 | type: Never
16 | securityContext:
17 | runAsUser: 0
18 | driver:
19 | cores: 1
20 | coreLimit: "1200m"
21 | memory: "512m"
22 | labels:
23 | version: 3.0.0
24 | serviceAccount: spark
25 | executor:
26 | cores: 1
27 | instances: 1
28 | memory: "512m"
29 | labels:
30 | version: 3.0.0
31 | monitoring:
32 | exposeDriverMetrics: true
33 | exposeExecutorMetrics: true
34 | metricsPropertiesFile: /etc/metrics/conf/metrics.properties
35 | prometheus:
36 | configFile: /etc/metrics/conf/prometheus.yaml
37 | jmxExporterJar: /opt/spark/jars/jmx_prometheus_javaagent.jar
38 | port: 8090
--------------------------------------------------------------------------------
/applications/spark-apps/hello-spark-operator-prometheus/podMonitor.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: monitoring.coreos.com/v1
2 | kind: PodMonitor
3 | metadata:
4 | name: hello-spark-operator-prometheus
5 | labels:
6 | release: prometheus
7 | spec:
8 | selector:
9 | matchLabels:
10 | sparkoperator.k8s.io/app-name: spark-pi-prom
11 | podMetricsEndpoints:
12 | - relabelings:
13 | - action: replace
14 | targetLabel: __address__
15 | sourceLabels:
16 | - __meta_kubernetes_pod_ip
17 | replacement: $1:8090
18 | path: /
--------------------------------------------------------------------------------
/applications/spark-apps/hello-spark-operator/ingress.yaml:
--------------------------------------------------------------------------------
# Migrated from extensions/v1beta1 (deprecated since 1.14, removed in
# Kubernetes 1.22) to the GA networking.k8s.io/v1 Ingress API. The v1 API
# also requires pathType, which the old manifest already set.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  labels:
    app.kubernetes.io/name: hello-spark-operator
  name: hello-spark-operator
  namespace: default
spec:
  rules:
  # Routes the Spark driver UI service through the cluster ingress controller.
  - host: hello-spark-operator.localhost
    http:
      paths:
      - backend:
          service:
            name: spark-pi-ui-svc
            port:
              number: 4040
        pathType: ImplementationSpecific
--------------------------------------------------------------------------------
/applications/spark-apps/hello-spark-operator/spark-application.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: "sparkoperator.k8s.io/v1beta2"
2 | kind: SparkApplication
3 | metadata:
4 | name: spark-pi
5 | namespace: default
6 | spec:
7 | type: Scala
8 | mode: cluster
9 | image: "gcr.io/spark-operator/spark:v3.0.0"
10 | imagePullPolicy: Always
11 | mainClass: org.apache.spark.examples.SparkPi
12 | mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.12-3.0.0.jar"
13 | sparkVersion: "3.0.0"
14 | restartPolicy:
15 | type: Never
16 | volumes:
17 | - name: "test-volume"
18 | hostPath:
19 | path: "/tmp"
20 | type: Directory
21 | driver:
22 | cores: 1
23 | coreLimit: "1200m"
24 | memory: "512m"
25 | labels:
26 | version: 3.0.0
27 | serviceAccount: spark
28 | volumeMounts:
29 | - name: "test-volume"
30 | mountPath: "/tmp"
31 | executor:
32 | cores: 1
33 | instances: 1
34 | memory: "512m"
35 | labels:
36 | version: 3.0.0
37 | volumeMounts:
38 | - name: "test-volume"
39 | mountPath: "/tmp"
--------------------------------------------------------------------------------
/applications/spark-apps/hello-spark-submit/test-job-example.yaml:
--------------------------------------------------------------------------------
# One-shot Job that drives a spark-submit against the cluster's own API server.
apiVersion: batch/v1
kind: Job
metadata:
  name: spark-on-localhost-example
spec:
  template:
    spec:
      containers:
      - name: spark
        image: k8s-test/spark:v2.4.4
        # Image must be pre-loaded into the kind node (pull policy Never).
        imagePullPolicy: Never
        # Fix: spark-submit runs inside the cluster, so target the API server
        # through its stable in-cluster DNS name instead of a hard-coded node
        # IP (172.18.0.2 was a kind-assigned address that changes whenever the
        # cluster is recreated).
        command: [
          "/bin/sh",
          "-c",
          "/opt/spark/bin/spark-submit \
          --master k8s://https://kubernetes.default.svc:443 \
          --deploy-mode cluster \
          --name spark-pi \
          --class org.apache.spark.examples.SparkPi \
          --conf spark.executor.instances=2 \
          --conf spark.kubernetes.container.image=k8s-test/spark-on-localhost:v1.0 \
          --conf spark.kubernetes.container.image.pullPolicy=Never \
          --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
          --executor-memory 500M \
          /opt/spark/examples/jars/spark-examples_2.11-2.4.4.jar 100"
        ]
      # ServiceAccount used both by this pod and by the driver it submits.
      serviceAccountName: spark
      restartPolicy: Never
  backoffLimit: 4
--------------------------------------------------------------------------------
/applications/spark-apps/spark-history-server/Chart.yaml:
--------------------------------------------------------------------------------
apiVersion: v2
name: spark-history-server
description: Helm chart to package Spark History Server

# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
# NOTE(review): 0.0.0 is a placeholder and no appVersion is declared, so
# templates/deployment.yaml's `default $.Chart.Version` image-tag fallback
# would resolve to "0.0.0" if values.yaml stopped pinning image.tag.
version: 0.0.0
--------------------------------------------------------------------------------
/applications/spark-apps/spark-history-server/templates/_helpers.tpl:
--------------------------------------------------------------------------------
{{/*
Expand the name of the chart.
Resolves to .Values.nameOverride when set, otherwise the chart name.
*/}}
{{- define "sparkHistoryServer.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "sparkHistoryServer.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}


{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "sparkHistoryServer.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Common labels
Rendered on every resource in this chart. Uses .Chart.Version for the
app.kubernetes.io/version label (Chart.yaml declares no appVersion).
Optional team/project labels come from .Values.team / .Values.project.
*/}}
{{- define "sparkHistoryServer.labels" -}}
helm.sh/chart: {{ include "sparkHistoryServer.chart" . }}
{{- if .Values.team }}
app.kubernetes.io/team: {{ .Values.team | quote }}
{{- end }}
{{- if .Values.project }}
app.kubernetes.io/project: {{ .Values.project | quote }}
{{- end }}
app.kubernetes.io/application: {{ include "sparkHistoryServer.fullname" . | quote }}
{{ include "sparkHistoryServer.selectorLabels" . }}
{{- if .Chart.Version }}
app.kubernetes.io/version: {{ .Chart.Version | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service | quote }}
{{- end }}

{{/*
Selector labels
Stable subset of labels used for Service selectors and Deployment matchLabels.
*/}}
{{- define "sparkHistoryServer.selectorLabels" -}}
app.kubernetes.io/name: {{ include "sparkHistoryServer.name" . | quote }}
app.kubernetes.io/instance: {{ .Release.Name | quote }}
{{- end }}


{{/*
Name of the role and role binding to use
NOTE(review): reads .Values.rbac.name, which values.yaml does not define, and
no template visible in this chart includes this helper — confirm before using
it (invoking it without a `rbac` values block would fail at render time).
*/}}
{{- define "sparkHistoryServer.rbacName" -}}
{{- default (include "sparkHistoryServer.fullname" .) .Values.rbac.name }}
{{- end }}



{{/*
Create the name of the spark-hs service account to use
Falls back to "default" when serviceAccount.create is false and no name given.
*/}}
{{- define "sparkHistoryServer.serviceAccountName" -}}
{{- if .Values.sparkHistoryServer.serviceAccount.create -}}
{{ default (include "sparkHistoryServer.fullname" .) .Values.sparkHistoryServer.serviceAccount.name }}
{{- else -}}
{{ default "default" .Values.sparkHistoryServer.serviceAccount.name }}
{{- end -}}
{{- end -}}
--------------------------------------------------------------------------------
/applications/spark-apps/spark-history-server/templates/configmap.yaml:
--------------------------------------------------------------------------------
{{- if .Values.sparkHistoryServer.create -}}
# Spark configuration for the history server; mounted into the pod as
# spark-defaults.conf (see the config-volume in templates/deployment.yaml).
kind: ConfigMap
apiVersion: v1
metadata:
  name: {{ include "sparkHistoryServer.fullname" . }}
  labels:
    {{- include "sparkHistoryServer.labels" . | nindent 4 }}
data:
  spark-defaults.conf: |-
    spark.history.fs.eventLog.rolling.maxFilesToRetain=5
    spark.history.fs.cleaner.enabled=true
{{- end }}
13 |
--------------------------------------------------------------------------------
/applications/spark-apps/spark-history-server/templates/deployment.yaml:
--------------------------------------------------------------------------------
{{- if .Values.sparkHistoryServer.create -}}
# Deployment running the Spark history server (bitnami/spark image) reading
# event logs from S3 and serving the UI on the configured internal port.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ template "sparkHistoryServer.fullname" . }}
  labels:
    {{- include "sparkHistoryServer.labels" . | nindent 4 }}
spec:
  replicas: {{ .Values.sparkHistoryServer.replicaCount }}
  revisionHistoryLimit: 3
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 50%
      maxSurge: 1
  selector:
    matchLabels:
      {{- include "sparkHistoryServer.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      labels:
        {{- include "sparkHistoryServer.selectorLabels" . | nindent 8 }}
    spec:
      {{- with $.Values.image.pullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.sparkHistoryServer.securityContext }}
      # Fix: this block was previously rendered at column 0 (outside the pod
      # spec), so the configured runAsUser was never applied.
      securityContext:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      serviceAccountName: {{ include "sparkHistoryServer.serviceAccountName" . }}
      containers:
        - name: {{ .Chart.Name }}
          image: "{{ $.Values.image.repository }}:{{ $.Values.image.tag | default $.Chart.Version }}"
          imagePullPolicy: {{ $.Values.image.pullPolicy }}
          command:
            - '/opt/bitnami/spark/sbin/start-history-server.sh'
          env:
            # spark-daemon.sh only checks whether this variable is SET (any
            # value) to keep the process in the foreground, as required for
            # PID 1 in a container. "true" states that intent; the previous
            # "false" behaved identically but was misleading.
            - name: SPARK_NO_DAEMONIZE
              value: "true"
            - name: SPARK_HISTORY_OPTS
              value: "-Dspark.history.fs.logDirectory={{ .Values.sparkHistoryServer.s3logDir }}"
            - name: SPARK_CONF_DIR
              value: /opt/bitnami/spark/conf
            # Static S3 credentials from a pre-existing "aws-secrets" Secret;
            # this chart does not create it.
            - name: AWS_ACCESS_KEY_ID
              valueFrom:
                secretKeyRef:
                  name: aws-secrets
                  key: AWS_ACCESS_KEY
            - name: AWS_SECRET_ACCESS_KEY
              valueFrom:
                secretKeyRef:
                  name: aws-secrets
                  key: AWS_SECRET_ACCESS_KEY
            - name: AWS_REGION
              value: "eu-west-1"
          volumeMounts:
            # Overlay only spark-defaults.conf from the chart ConfigMap.
            - name: config-volume
              mountPath: /opt/bitnami/spark/conf/spark-defaults.conf
              subPath: spark-defaults.conf
          ports:
            - name: http
              containerPort: {{ .Values.sparkHistoryServer.service.internalPort }}
              protocol: TCP
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          resources:
            {{- toYaml .Values.sparkHistoryServer.resources | nindent 12 }}
      volumes:
        - name: config-volume
          configMap:
            name: {{ template "sparkHistoryServer.fullname" . }}
{{- end }}
74 |
--------------------------------------------------------------------------------
/applications/spark-apps/spark-history-server/templates/ingress.yaml:
--------------------------------------------------------------------------------
{{- if .Values.sparkHistoryServer.create -}}
{{- if .Values.sparkHistoryServer.ingress.enabled -}}
{{- $serviceName := include "sparkHistoryServer.fullname" . -}}
{{- $servicePort := .Values.sparkHistoryServer.service.externalPort -}}
# Routes the configured host/path to the history server Service.
# NOTE(review): networking.k8s.io/v1beta1 was removed in Kubernetes 1.22 —
# migrate to networking.k8s.io/v1 when the target clusters support it.
apiVersion: networking.k8s.io/v1beta1
kind: Ingress
metadata:
  name: {{ template "sparkHistoryServer.fullname" . }}
  labels:
    {{- include "sparkHistoryServer.labels" . | nindent 4 }}
  {{- with .Values.sparkHistoryServer.ingress.annotations }}
  annotations:
{{ toYaml . | indent 4 }}
  {{- end }}
spec:
  rules:
    - host: {{ .Values.sparkHistoryServer.ingress.host }}
      http:
        paths:
          - path: {{ .Values.sparkHistoryServer.ingress.path }}
            backend:
              serviceName: {{ $serviceName }}
              servicePort: {{ $servicePort }}
{{- end }}
{{- end }}
26 |
--------------------------------------------------------------------------------
/applications/spark-apps/spark-history-server/templates/service.yaml:
--------------------------------------------------------------------------------
{{- if .Values.sparkHistoryServer.create -}}
# Service in front of the history-server pods; externalPort is what the
# Ingress targets, internalPort is the container's listening port.
apiVersion: v1
kind: Service
metadata:
  name: {{ include "sparkHistoryServer.fullname" . }}
  labels:
    {{- include "sparkHistoryServer.labels" . | nindent 4 }}
spec:
  type: {{ .Values.sparkHistoryServer.service.type }}
  ports:
    - name: {{ .Chart.Name }}
      port: {{ .Values.sparkHistoryServer.service.externalPort }}
      targetPort: {{ .Values.sparkHistoryServer.service.internalPort }}
      protocol: TCP
  selector:
    {{- include "sparkHistoryServer.selectorLabels" . | nindent 6 }}
{{- end }}
18 |
--------------------------------------------------------------------------------
/applications/spark-apps/spark-history-server/templates/serviceaccount.yaml:
--------------------------------------------------------------------------------
{{- if and .Values.sparkHistoryServer.create .Values.sparkHistoryServer.serviceAccount.create }}
# ServiceAccount for the history-server pod; name resolution (override vs
# fullname vs "default") lives in the serviceAccountName helper.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "sparkHistoryServer.serviceAccountName" . }}
  labels:
    {{- include "sparkHistoryServer.labels" . | nindent 4 }}
  {{- with .Values.sparkHistoryServer.serviceAccount.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
{{- end }}
15 |
--------------------------------------------------------------------------------
/applications/spark-apps/spark-history-server/values.yaml:
--------------------------------------------------------------------------------
# -- Set runtime image to use for running the jobs
# @default -- -
image:
  repository: bitnami/spark
  # -- Tag to use
  # @default -- *app-version*
  tag: "3.0.1"
  # -- Pull policy to apply
  pullPolicy: IfNotPresent
  # -- Image pull secrets
  pullSecrets: [ ]



# -- Spark history server configuration - the history server uses the same image configuration as the jobs
# and unless specified resources are created as app-name-history-server. If enabled, the job sparkConfig
# will include the necessary logging configs
# @default -- -
sparkHistoryServer:
  # -- Whether to create a history server
  create: true
  # -- S3 path to read logs from
  # @default -- **required**
  s3logDir: "s3a://k8s-days-spain-spark/spark-logs/"

  ingress:
    # -- Whether the ingress for the history server should be enabled
    enabled: true
    # -- History server ingress host
    # @default -- **required**
    host: 'spark-history-server.localhost'
    # -- Path under the host that routes to the history server UI
    path: '/'
    # -- Extra annotations for the Ingress resource
    annotations: { }

  service:
    # -- Port exposed by the Service (targeted by the Ingress)
    externalPort: 80
    # -- Container port exposed by the history-server pod
    internalPort: 18080
    type: ClusterIP

  # -- Number of history-server replicas
  replicaCount: 1
  resources:
    limits:
      # cpu limit intentionally left unset; uncomment to enforce one
      #cpu: 500m
      memory: 512Mi

    requests:
      cpu: 250m
      memory: 256Mi

  # NOTE(review): runs as root — presumably required by the image's startup
  # script; confirm before hardening.
  securityContext:
    runAsUser: 0

  serviceAccount:
    # -- Whether a ServiceAccount for the Spark History server should be created
    create: true
    # -- The name of the ServiceAccount to create or use
    # @default -- *app-name-history-service*
    name: ''
    # Annotations for the Service Account
    # Example:
    # annotations:
    #   eks.amazonaws.com/role-arn: arn:aws:iam::xxxxx:role/spark-history-server
    annotations: {}
65 |
--------------------------------------------------------------------------------
/deployments/applications/platform/Chart.yaml:
--------------------------------------------------------------------------------
# Chart wrapping the Argo CD Application templates under ./templates.
# NOTE(review): apiVersion v1 is the Helm 2 chart format while the
# spark-history-server chart in this repo uses v2 — confirm this is intended.
apiVersion: v1
appVersion: "1.0"
description: Applications
name: applications
version: 0.2.0
6 |
--------------------------------------------------------------------------------
/deployments/applications/platform/argo-apps.yaml:
--------------------------------------------------------------------------------
# App-of-apps: Argo CD Application that syncs the chart under
# deployments/applications/platform, which in turn defines one Application
# per platform component (argo-workflows, prometheus, rbac, spark-operator).
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: platform-apps
  namespace: default
spec:
  project: default
  source:
    repoURL: 'https://github.com/empathyco/platform-spark-kubernetes-samples.git'
    path: deployments/applications/platform
    targetRevision: main
  destination:
    server: 'https://kubernetes.default.svc'
    namespace: default
  syncPolicy:
    automated:
      # prune disabled here: deleting a template does not delete the child app.
      prune: false
      selfHeal: true
19 |
--------------------------------------------------------------------------------
/deployments/applications/platform/templates/argo-workflows.yaml:
--------------------------------------------------------------------------------
# Argo CD Application deploying the argo-workflows chart from this repo.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: argo-workflows
  namespace: default
spec:
  destination:
    namespace: default
    server: {{ .Values.spec.destination.server }}
  project: {{ .Values.project }}
  source:
    path: applications/platform/argo-workflows
    repoURL: {{ .Values.spec.source.repoURL }}
    targetRevision: {{ .Values.spec.source.targetRevision }}
  # Fix: values.yaml declares a syncPolicy (automated prune/self-heal,
  # CreateNamespace) that was never rendered, leaving this Application
  # manual-sync only.
  {{- with .Values.syncPolicy }}
  syncPolicy:
    {{- toYaml . | nindent 4 }}
  {{- end }}
15 |
--------------------------------------------------------------------------------
/deployments/applications/platform/templates/prometheus.yaml:
--------------------------------------------------------------------------------
# Argo CD Application deploying kube-prometheus-stack into "monitoring".
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: prometheus
  namespace: default
spec:
  destination:
    namespace: monitoring
    server: {{ .Values.spec.destination.server }}
  project: {{ .Values.project }}
  source:
    path: applications/platform/monitoring/prometheus
    repoURL: {{ .Values.spec.source.repoURL }}
    targetRevision: {{ .Values.spec.source.targetRevision }}
  # Fix: values.yaml declares a syncPolicy (automated prune/self-heal,
  # CreateNamespace) that was never rendered, leaving this Application
  # manual-sync only. CreateNamespace also provisions "monitoring".
  {{- with .Values.syncPolicy }}
  syncPolicy:
    {{- toYaml . | nindent 4 }}
  {{- end }}
15 |
--------------------------------------------------------------------------------
/deployments/applications/platform/templates/rbac.yaml:
--------------------------------------------------------------------------------
# Argo CD Application applying the raw RBAC manifests from this repo.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: rbac
  namespace: default
spec:
  destination:
    namespace: default
    server: {{ .Values.spec.destination.server }}
  project: {{ .Values.project }}
  source:
    path: applications/platform/rbac
    repoURL: {{ .Values.spec.source.repoURL }}
    targetRevision: {{ .Values.spec.source.targetRevision }}
  # Fix: values.yaml declares a syncPolicy (automated prune/self-heal,
  # CreateNamespace) that was never rendered, leaving this Application
  # manual-sync only.
  {{- with .Values.syncPolicy }}
  syncPolicy:
    {{- toYaml . | nindent 4 }}
  {{- end }}
15 |
--------------------------------------------------------------------------------
/deployments/applications/platform/templates/spark-operator.yaml:
--------------------------------------------------------------------------------
# Argo CD Application deploying the spark-operator chart from this repo.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: spark-operator
  namespace: default
spec:
  destination:
    namespace: default
    server: {{ .Values.spec.destination.server }}
  project: {{ .Values.project }}
  source:
    path: applications/platform/spark-operator
    repoURL: {{ .Values.spec.source.repoURL }}
    targetRevision: {{ .Values.spec.source.targetRevision }}
  # Fix: values.yaml declares a syncPolicy (automated prune/self-heal,
  # CreateNamespace) that was never rendered, leaving this Application
  # manual-sync only.
  {{- with .Values.syncPolicy }}
  syncPolicy:
    {{- toYaml . | nindent 4 }}
  {{- end }}
15 |
--------------------------------------------------------------------------------
/deployments/applications/platform/values.yaml:
--------------------------------------------------------------------------------
# Shared values for the Application templates in ./templates.
project: default
spec:
  source:
    # Git repo and revision every child Application points at.
    repoURL: 'https://github.com/empathyco/platform-spark-kubernetes-samples.git'
    targetRevision: main
  destination:
    # In-cluster API server (deploy to the same cluster Argo CD runs in).
    server: 'https://kubernetes.default.svc'
# Desired sync policy for the child Applications: automated sync with prune
# and self-heal, creating missing destination namespaces.
# NOTE(review): verify the templates actually render this block.
syncPolicy:
  automated:
    prune: true
    selfHeal: true
  syncOptions:
    - CreateNamespace=true
14 |
--------------------------------------------------------------------------------
/deployments/applications/spark-apps/Chart.yaml:
--------------------------------------------------------------------------------
# Chart wrapping the spark-apps Argo CD Application templates under ./templates.
# NOTE(review): same chart name "applications" as the sibling platform chart —
# confirm the duplicate name is intentional.
apiVersion: v1
appVersion: "1.0"
description: Applications
name: applications
version: 0.2.0
6 |
--------------------------------------------------------------------------------
/deployments/applications/spark-apps/argo-apps.yaml:
--------------------------------------------------------------------------------
# App-of-apps: Argo CD Application that syncs the chart under
# deployments/applications/spark-apps, which defines one Application per
# Spark sample (hello-* examples and the history server).
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: spark-apps
  namespace: default
spec:
  project: default
  source:
    repoURL: 'https://github.com/empathyco/platform-spark-kubernetes-samples.git'
    path: deployments/applications/spark-apps
    targetRevision: main
  destination:
    server: 'https://kubernetes.default.svc'
    namespace: default
  syncPolicy:
    automated:
      # prune disabled here: deleting a template does not delete the child app.
      prune: false
      selfHeal: true
19 |
--------------------------------------------------------------------------------
/deployments/applications/spark-apps/templates/hello-argo-workflow-template.yaml:
--------------------------------------------------------------------------------
# Argo CD Application deploying the hello-argo-workflow-template sample.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: hello-argo-workflow-template
  namespace: default
spec:
  destination:
    namespace: default
    server: {{ .Values.spec.destination.server }}
  project: {{ .Values.project }}
  source:
    path: applications/spark-apps/hello-argo-workflow-template
    repoURL: {{ .Values.spec.source.repoURL }}
    targetRevision: {{ .Values.spec.source.targetRevision }}
  # Fix: values.yaml declares a syncPolicy (automated prune/self-heal,
  # CreateNamespace) that was never rendered, leaving this Application
  # manual-sync only.
  {{- with .Values.syncPolicy }}
  syncPolicy:
    {{- toYaml . | nindent 4 }}
  {{- end }}
15 |
--------------------------------------------------------------------------------
/deployments/applications/spark-apps/templates/hello-argo-workflows.yaml:
--------------------------------------------------------------------------------
# Argo CD Application deploying the hello-argo-workflows sample.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: hello-argo-workflows
  namespace: default
spec:
  destination:
    namespace: default
    server: {{ .Values.spec.destination.server }}
  project: {{ .Values.project }}
  source:
    path: applications/spark-apps/hello-argo-workflows
    repoURL: {{ .Values.spec.source.repoURL }}
    targetRevision: {{ .Values.spec.source.targetRevision }}
  # Fix: values.yaml declares a syncPolicy (automated prune/self-heal,
  # CreateNamespace) that was never rendered, leaving this Application
  # manual-sync only.
  {{- with .Values.syncPolicy }}
  syncPolicy:
    {{- toYaml . | nindent 4 }}
  {{- end }}
15 |
--------------------------------------------------------------------------------
/deployments/applications/spark-apps/templates/hello-spark-operator-argo-workflows.yaml:
--------------------------------------------------------------------------------
# Argo CD Application deploying the hello-spark-operator-argo-workflows sample.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: hello-spark-operator-argo-workflows
  namespace: default
spec:
  destination:
    namespace: default
    server: {{ .Values.spec.destination.server }}
  project: {{ .Values.project }}
  source:
    path: applications/spark-apps/hello-spark-operator-argo-workflows
    repoURL: {{ .Values.spec.source.repoURL }}
    targetRevision: {{ .Values.spec.source.targetRevision }}
  # Fix: values.yaml declares a syncPolicy (automated prune/self-heal,
  # CreateNamespace) that was never rendered, leaving this Application
  # manual-sync only.
  {{- with .Values.syncPolicy }}
  syncPolicy:
    {{- toYaml . | nindent 4 }}
  {{- end }}
15 |
--------------------------------------------------------------------------------
/deployments/applications/spark-apps/templates/hello-spark-operator-history.yaml:
--------------------------------------------------------------------------------
# Argo CD Application deploying the hello-spark-operator-history sample.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: hello-spark-operator-history
  namespace: default
spec:
  destination:
    namespace: default
    server: {{ .Values.spec.destination.server }}
  project: {{ .Values.project }}
  source:
    path: applications/spark-apps/hello-spark-operator-history
    repoURL: {{ .Values.spec.source.repoURL }}
    targetRevision: {{ .Values.spec.source.targetRevision }}
  # Fix: values.yaml declares a syncPolicy (automated prune/self-heal,
  # CreateNamespace) that was never rendered, leaving this Application
  # manual-sync only.
  {{- with .Values.syncPolicy }}
  syncPolicy:
    {{- toYaml . | nindent 4 }}
  {{- end }}
15 |
--------------------------------------------------------------------------------
/deployments/applications/spark-apps/templates/hello-spark-operator.yaml:
--------------------------------------------------------------------------------
# Argo CD Application deploying the hello-spark-operator sample.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: hello-spark-operator
  namespace: default
spec:
  destination:
    namespace: default
    server: {{ .Values.spec.destination.server }}
  project: {{ .Values.project }}
  source:
    path: applications/spark-apps/hello-spark-operator
    repoURL: {{ .Values.spec.source.repoURL }}
    targetRevision: {{ .Values.spec.source.targetRevision }}
  # Fix: values.yaml declares a syncPolicy (automated prune/self-heal,
  # CreateNamespace) that was never rendered, leaving this Application
  # manual-sync only.
  {{- with .Values.syncPolicy }}
  syncPolicy:
    {{- toYaml . | nindent 4 }}
  {{- end }}
15 |
--------------------------------------------------------------------------------
/deployments/applications/spark-apps/templates/hello-spark-submit.yaml:
--------------------------------------------------------------------------------
# Argo CD Application deploying the hello-spark-submit sample.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: hello-spark-submit
  namespace: default
spec:
  destination:
    namespace: default
    server: {{ .Values.spec.destination.server }}
  project: {{ .Values.project }}
  source:
    path: applications/spark-apps/hello-spark-submit
    repoURL: {{ .Values.spec.source.repoURL }}
    targetRevision: {{ .Values.spec.source.targetRevision }}
  # Fix: values.yaml declares a syncPolicy (automated prune/self-heal,
  # CreateNamespace) that was never rendered, leaving this Application
  # manual-sync only.
  {{- with .Values.syncPolicy }}
  syncPolicy:
    {{- toYaml . | nindent 4 }}
  {{- end }}
15 |
--------------------------------------------------------------------------------
/deployments/applications/spark-apps/templates/spark-history-server.yaml:
--------------------------------------------------------------------------------
# Argo CD Application deploying the spark-history-server chart.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: spark-history-server
  namespace: default
spec:
  destination:
    namespace: default
    server: {{ .Values.spec.destination.server }}
  project: {{ .Values.project }}
  source:
    path: applications/spark-apps/spark-history-server
    repoURL: {{ .Values.spec.source.repoURL }}
    targetRevision: {{ .Values.spec.source.targetRevision }}
  # Fix: values.yaml declares a syncPolicy (automated prune/self-heal,
  # CreateNamespace) that was never rendered, leaving this Application
  # manual-sync only.
  {{- with .Values.syncPolicy }}
  syncPolicy:
    {{- toYaml . | nindent 4 }}
  {{- end }}
15 |
--------------------------------------------------------------------------------
/deployments/applications/spark-apps/values.yaml:
--------------------------------------------------------------------------------
# Shared values for the Application templates in ./templates.
project: default
spec:
  source:
    # Git repo and revision every child Application points at.
    repoURL: 'https://github.com/empathyco/platform-spark-kubernetes-samples.git'
    targetRevision: main
  destination:
    # In-cluster API server (deploy to the same cluster Argo CD runs in).
    server: 'https://kubernetes.default.svc'
# Desired sync policy for the child Applications: automated sync with prune
# and self-heal, creating missing destination namespaces.
# NOTE(review): verify the templates actually render this block.
syncPolicy:
  automated:
    prune: true
    selfHeal: true
  syncOptions:
    - CreateNamespace=true
14 |
--------------------------------------------------------------------------------
/kind/cluster-conf.yaml:
--------------------------------------------------------------------------------
# kind cluster with a single control-plane node, prepared for host ingress.
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
  kubeadmConfigPatches:
  - |
    kind: InitConfiguration
    nodeRegistration:
      kubeletExtraArgs:
        # Label targeted by ingress-controller deployments that schedule onto
        # the kind node (ingress-ready=true).
        node-labels: "ingress-ready=true"
  extraPortMappings:
  # Forward host 80/443 to the node so the ingress controller is reachable
  # at http(s)://*.localhost.
  - containerPort: 80
    hostPort: 80
    protocol: TCP
  - containerPort: 443
    hostPort: 443
    protocol: TCP
  # Host port 8085 -> node port 30211.
  # NOTE(review): confirm which NodePort service binds 30211.
  - containerPort: 30211
    hostPort: 8085
    protocol: TCP
--------------------------------------------------------------------------------
/kind/example.yaml:
--------------------------------------------------------------------------------
# Smoke test for the kind ingress setup: two echo pods + services reachable
# at http://test.localhost/foo and http://test.localhost/bar.
kind: Pod
apiVersion: v1
metadata:
  name: foo-app
  labels:
    app: foo
spec:
  containers:
  - name: foo-app
    image: hashicorp/http-echo:0.2.3
    args:
    - "-text=foo"
---
kind: Service
apiVersion: v1
metadata:
  name: foo-service
spec:
  selector:
    app: foo
  ports:
    # Default port used by the image
    - port: 5678
---
kind: Pod
apiVersion: v1
metadata:
  name: bar-app
  labels:
    app: bar
spec:
  containers:
  - name: bar-app
    image: hashicorp/http-echo:0.2.3
    args:
    - "-text=bar"
---
kind: Service
apiVersion: v1
metadata:
  name: bar-service
spec:
  selector:
    app: bar
  ports:
    # Default port used by the image
    - port: 5678
---
# NOTE(review): v1beta1 Ingress was removed in Kubernetes 1.22; fine for the
# pinned kind image, migrate to networking.k8s.io/v1 when upgrading.
apiVersion: networking.k8s.io/v1beta1
kind: Ingress
metadata:
  name: example-ingress
spec:
  rules:
  - host: test.localhost
    http:
      paths:
      - path: /foo
        backend:
          serviceName: foo-service
          servicePort: 5678
      - path: /bar
        backend:
          serviceName: bar-service
          servicePort: 5678
---
--------------------------------------------------------------------------------
/kind/ingress-nginx.yaml:
--------------------------------------------------------------------------------
# NOTE(review): rendered output of the ingress-nginx Helm chart 3.4.0
# (controller v0.40.1) — prefer regenerating with helm over hand-editing.
apiVersion: v1
kind: Namespace
metadata:
  name: ingress-nginx
  labels:
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/instance: ingress-nginx

---
# Source: ingress-nginx/templates/controller-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    helm.sh/chart: ingress-nginx-3.4.0
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/version: 0.40.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: controller
  name: ingress-nginx
  namespace: ingress-nginx
---
# Source: ingress-nginx/templates/controller-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  labels:
    helm.sh/chart: ingress-nginx-3.4.0
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/version: 0.40.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: controller
  name: ingress-nginx-controller
  namespace: ingress-nginx
# NOTE(review): empty controller configuration — confirm intentional.
data:
---
# Source: ingress-nginx/templates/clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    helm.sh/chart: ingress-nginx-3.4.0
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/version: 0.40.1
    app.kubernetes.io/managed-by: Helm
  name: ingress-nginx
rules:
  - apiGroups:
      - ''
    resources:
      - configmaps
      - endpoints
      - nodes
      - pods
      - secrets
    verbs:
      - list
      - watch
  - apiGroups:
      - ''
    resources:
      - nodes
    verbs:
      - get
  - apiGroups:
      - ''
    resources:
      - services
    verbs:
      - get
      - list
      - update
      - watch
  - apiGroups:
      - extensions
      - networking.k8s.io # k8s 1.14+
    resources:
      - ingresses
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ''
    resources:
      - events
    verbs:
      - create
      - patch
  - apiGroups:
      - extensions
      - networking.k8s.io # k8s 1.14+
    resources:
      - ingresses/status
    verbs:
      - update
  - apiGroups:
      - networking.k8s.io # k8s 1.14+
    resources:
      - ingressclasses
    verbs:
      - get
      - list
      - watch
---
# Source: ingress-nginx/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    helm.sh/chart: ingress-nginx-3.4.0
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/version: 0.40.1
    app.kubernetes.io/managed-by: Helm
  name: ingress-nginx
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: ingress-nginx
subjects:
  - kind: ServiceAccount
    name: ingress-nginx
    namespace: ingress-nginx
---
# Source: ingress-nginx/templates/controller-role.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  labels:
    helm.sh/chart: ingress-nginx-3.4.0
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/version: 0.40.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: controller
  name: ingress-nginx
  namespace: ingress-nginx
rules:
  - apiGroups:
      - ''
    resources:
      - namespaces
    verbs:
      - get
  - apiGroups:
      - ''
    resources:
      - configmaps
      - pods
      - secrets
      - endpoints
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ''
    resources:
      - services
    verbs:
      - get
      - list
      - update
      - watch
  - apiGroups:
      - extensions
      - networking.k8s.io # k8s 1.14+
    resources:
      - ingresses
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - extensions
      - networking.k8s.io # k8s 1.14+
    resources:
      - ingresses/status
    verbs:
      - update
  - apiGroups:
      - networking.k8s.io # k8s 1.14+
    resources:
      - ingressclasses
    verbs:
      - get
      - list
      - watch
  # Leader-election ConfigMap access is restricted to the election resource.
  - apiGroups:
      - ''
    resources:
      - configmaps
    resourceNames:
      - ingress-controller-leader-nginx
    verbs:
      - get
      - update
  - apiGroups:
      - ''
    resources:
      - configmaps
    verbs:
      - create
  - apiGroups:
      - ''
    resources:
      - endpoints
    verbs:
      - create
      - get
      - update
  - apiGroups:
      - ''
    resources:
      - events
    verbs:
      - create
      - patch
---
# Source: ingress-nginx/templates/controller-rolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  labels:
    helm.sh/chart: ingress-nginx-3.4.0
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/version: 0.40.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: controller
  name: ingress-nginx
  namespace: ingress-nginx
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: ingress-nginx
subjects:
  - kind: ServiceAccount
    name: ingress-nginx
    namespace: ingress-nginx
---
# Source: ingress-nginx/templates/controller-service-webhook.yaml
apiVersion: v1
kind: Service
metadata:
  labels:
    helm.sh/chart: ingress-nginx-3.4.0
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/version: 0.40.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: controller
  name: ingress-nginx-controller-admission
  namespace: ingress-nginx
spec:
  type: ClusterIP
  ports:
    - name: https-webhook
      port: 443
      targetPort: webhook
  selector:
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/component: controller
---
# Source: ingress-nginx/templates/controller-service.yaml
apiVersion: v1
kind: Service
metadata:
  labels:
    helm.sh/chart: ingress-nginx-3.4.0
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/version: 0.40.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: controller
  name: ingress-nginx-controller
  namespace: ingress-nginx
spec:
  type: NodePort
  ports:
    - name: http
      port: 80
      protocol: TCP
      targetPort: http
    - name: https
      port: 443
      protocol: TCP
      targetPort: https
  selector:
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/component: controller
---
299 | # Source: ingress-nginx/templates/controller-deployment.yaml
300 | apiVersion: apps/v1
301 | kind: Deployment
302 | metadata:
303 | labels:
304 | helm.sh/chart: ingress-nginx-3.4.0
305 | app.kubernetes.io/name: ingress-nginx
306 | app.kubernetes.io/instance: ingress-nginx
307 | app.kubernetes.io/version: 0.40.1
308 | app.kubernetes.io/managed-by: Helm
309 | app.kubernetes.io/component: controller
310 | name: ingress-nginx-controller
311 | namespace: ingress-nginx
312 | spec:
313 | selector:
314 | matchLabels:
315 | app.kubernetes.io/name: ingress-nginx
316 | app.kubernetes.io/instance: ingress-nginx
317 | app.kubernetes.io/component: controller
318 | revisionHistoryLimit: 10
319 | strategy:
320 | rollingUpdate:
321 | maxUnavailable: 1
322 | type: RollingUpdate
323 | minReadySeconds: 0
324 | template:
325 | metadata:
326 | labels:
327 | app.kubernetes.io/name: ingress-nginx
328 | app.kubernetes.io/instance: ingress-nginx
329 | app.kubernetes.io/component: controller
330 | spec:
331 | dnsPolicy: ClusterFirst
332 | containers:
333 | - name: controller
334 | image: k8s.gcr.io/ingress-nginx/controller:v0.40.1@sha256:abffcf2d25e3e7c7b67a315a7c664ec79a1588c9c945d3c7a75637c2f55caec6
335 | imagePullPolicy: IfNotPresent
336 | lifecycle:
337 | preStop:
338 | exec:
339 | command:
340 | - /wait-shutdown
341 | args:
342 | - /nginx-ingress-controller
343 | - --election-id=ingress-controller-leader
344 | - --ingress-class=nginx
345 | - --configmap=$(POD_NAMESPACE)/ingress-nginx-controller
346 | - --validating-webhook=:8443
347 | - --validating-webhook-certificate=/usr/local/certificates/cert
348 | - --validating-webhook-key=/usr/local/certificates/key
349 | - --publish-status-address=localhost
350 | securityContext:
351 | capabilities:
352 | drop:
353 | - ALL
354 | add:
355 | - NET_BIND_SERVICE
356 | runAsUser: 101
357 | allowPrivilegeEscalation: true
358 | env:
359 | - name: POD_NAME
360 | valueFrom:
361 | fieldRef:
362 | fieldPath: metadata.name
363 | - name: POD_NAMESPACE
364 | valueFrom:
365 | fieldRef:
366 | fieldPath: metadata.namespace
367 | - name: LD_PRELOAD
368 | value: /usr/local/lib/libmimalloc.so
369 | livenessProbe:
370 | httpGet:
371 | path: /healthz
372 | port: 10254
373 | scheme: HTTP
374 | initialDelaySeconds: 10
375 | periodSeconds: 10
376 | timeoutSeconds: 1
377 | successThreshold: 1
378 | failureThreshold: 5
379 | readinessProbe:
380 | httpGet:
381 | path: /healthz
382 | port: 10254
383 | scheme: HTTP
384 | initialDelaySeconds: 10
385 | periodSeconds: 10
386 | timeoutSeconds: 1
387 | successThreshold: 1
388 | failureThreshold: 3
389 | ports:
390 | - name: http
391 | containerPort: 80
392 | protocol: TCP
393 | hostPort: 80
394 | - name: https
395 | containerPort: 443
396 | protocol: TCP
397 | hostPort: 443
398 | - name: webhook
399 | containerPort: 8443
400 | protocol: TCP
401 | volumeMounts:
402 | - name: webhook-cert
403 | mountPath: /usr/local/certificates/
404 | readOnly: true
405 | resources:
406 | requests:
407 | cpu: 100m
408 | memory: 90Mi
409 | nodeSelector:
410 | ingress-ready: 'true'
411 | tolerations:
412 | - effect: NoSchedule
413 | key: node-role.kubernetes.io/master
414 | operator: Equal
415 | serviceAccountName: ingress-nginx
416 | terminationGracePeriodSeconds: 0
417 | volumes:
418 | - name: webhook-cert
419 | secret:
420 | secretName: ingress-nginx-admission
421 | ---
422 | # Source: ingress-nginx/templates/admission-webhooks/validating-webhook.yaml
423 | # before changing this value, check the required kubernetes version
424 | # https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#prerequisites
425 | apiVersion: admissionregistration.k8s.io/v1
426 | kind: ValidatingWebhookConfiguration
427 | metadata:
428 | labels:
429 | helm.sh/chart: ingress-nginx-3.4.0
430 | app.kubernetes.io/name: ingress-nginx
431 | app.kubernetes.io/instance: ingress-nginx
432 | app.kubernetes.io/version: 0.40.1
433 | app.kubernetes.io/managed-by: Helm
434 | app.kubernetes.io/component: admission-webhook
435 | name: ingress-nginx-admission
436 | webhooks:
437 | - name: validate.nginx.ingress.kubernetes.io
438 | rules:
439 | - apiGroups:
440 | - networking.k8s.io
441 | apiVersions:
442 | - v1beta1
443 | - v1
444 | operations:
445 | - CREATE
446 | - UPDATE
447 | resources:
448 | - ingresses
449 | failurePolicy: Fail
450 | sideEffects: None
451 | admissionReviewVersions:
452 | - v1
453 | - v1beta1
454 | clientConfig:
455 | service:
456 | namespace: ingress-nginx
457 | name: ingress-nginx-controller-admission
458 | path: /networking/v1beta1/ingresses
459 | ---
460 | # Source: ingress-nginx/templates/admission-webhooks/job-patch/serviceaccount.yaml
461 | apiVersion: v1
462 | kind: ServiceAccount
463 | metadata:
464 | name: ingress-nginx-admission
465 | annotations:
466 | helm.sh/hook: pre-install,pre-upgrade,post-install,post-upgrade
467 | helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
468 | labels:
469 | helm.sh/chart: ingress-nginx-3.4.0
470 | app.kubernetes.io/name: ingress-nginx
471 | app.kubernetes.io/instance: ingress-nginx
472 | app.kubernetes.io/version: 0.40.1
473 | app.kubernetes.io/managed-by: Helm
474 | app.kubernetes.io/component: admission-webhook
475 | namespace: ingress-nginx
476 | ---
477 | # Source: ingress-nginx/templates/admission-webhooks/job-patch/clusterrole.yaml
478 | apiVersion: rbac.authorization.k8s.io/v1
479 | kind: ClusterRole
480 | metadata:
481 | name: ingress-nginx-admission
482 | annotations:
483 | helm.sh/hook: pre-install,pre-upgrade,post-install,post-upgrade
484 | helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
485 | labels:
486 | helm.sh/chart: ingress-nginx-3.4.0
487 | app.kubernetes.io/name: ingress-nginx
488 | app.kubernetes.io/instance: ingress-nginx
489 | app.kubernetes.io/version: 0.40.1
490 | app.kubernetes.io/managed-by: Helm
491 | app.kubernetes.io/component: admission-webhook
492 | rules:
493 | - apiGroups:
494 | - admissionregistration.k8s.io
495 | resources:
496 | - validatingwebhookconfigurations
497 | verbs:
498 | - get
499 | - update
500 | ---
501 | # Source: ingress-nginx/templates/admission-webhooks/job-patch/clusterrolebinding.yaml
502 | apiVersion: rbac.authorization.k8s.io/v1
503 | kind: ClusterRoleBinding
504 | metadata:
505 | name: ingress-nginx-admission
506 | annotations:
507 | helm.sh/hook: pre-install,pre-upgrade,post-install,post-upgrade
508 | helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
509 | labels:
510 | helm.sh/chart: ingress-nginx-3.4.0
511 | app.kubernetes.io/name: ingress-nginx
512 | app.kubernetes.io/instance: ingress-nginx
513 | app.kubernetes.io/version: 0.40.1
514 | app.kubernetes.io/managed-by: Helm
515 | app.kubernetes.io/component: admission-webhook
516 | roleRef:
517 | apiGroup: rbac.authorization.k8s.io
518 | kind: ClusterRole
519 | name: ingress-nginx-admission
520 | subjects:
521 | - kind: ServiceAccount
522 | name: ingress-nginx-admission
523 | namespace: ingress-nginx
524 | ---
525 | # Source: ingress-nginx/templates/admission-webhooks/job-patch/role.yaml
526 | apiVersion: rbac.authorization.k8s.io/v1
527 | kind: Role
528 | metadata:
529 | name: ingress-nginx-admission
530 | annotations:
531 | helm.sh/hook: pre-install,pre-upgrade,post-install,post-upgrade
532 | helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
533 | labels:
534 | helm.sh/chart: ingress-nginx-3.4.0
535 | app.kubernetes.io/name: ingress-nginx
536 | app.kubernetes.io/instance: ingress-nginx
537 | app.kubernetes.io/version: 0.40.1
538 | app.kubernetes.io/managed-by: Helm
539 | app.kubernetes.io/component: admission-webhook
540 | namespace: ingress-nginx
541 | rules:
542 | - apiGroups:
543 | - ''
544 | resources:
545 | - secrets
546 | verbs:
547 | - get
548 | - create
549 | ---
550 | # Source: ingress-nginx/templates/admission-webhooks/job-patch/rolebinding.yaml
551 | apiVersion: rbac.authorization.k8s.io/v1
552 | kind: RoleBinding
553 | metadata:
554 | name: ingress-nginx-admission
555 | annotations:
556 | helm.sh/hook: pre-install,pre-upgrade,post-install,post-upgrade
557 | helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
558 | labels:
559 | helm.sh/chart: ingress-nginx-3.4.0
560 | app.kubernetes.io/name: ingress-nginx
561 | app.kubernetes.io/instance: ingress-nginx
562 | app.kubernetes.io/version: 0.40.1
563 | app.kubernetes.io/managed-by: Helm
564 | app.kubernetes.io/component: admission-webhook
565 | namespace: ingress-nginx
566 | roleRef:
567 | apiGroup: rbac.authorization.k8s.io
568 | kind: Role
569 | name: ingress-nginx-admission
570 | subjects:
571 | - kind: ServiceAccount
572 | name: ingress-nginx-admission
573 | namespace: ingress-nginx
574 | ---
575 | # Source: ingress-nginx/templates/admission-webhooks/job-patch/job-createSecret.yaml
576 | apiVersion: batch/v1
577 | kind: Job
578 | metadata:
579 | name: ingress-nginx-admission-create
580 | annotations:
581 | helm.sh/hook: pre-install,pre-upgrade
582 | helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
583 | labels:
584 | helm.sh/chart: ingress-nginx-3.4.0
585 | app.kubernetes.io/name: ingress-nginx
586 | app.kubernetes.io/instance: ingress-nginx
587 | app.kubernetes.io/version: 0.40.1
588 | app.kubernetes.io/managed-by: Helm
589 | app.kubernetes.io/component: admission-webhook
590 | namespace: ingress-nginx
591 | spec:
592 | template:
593 | metadata:
594 | name: ingress-nginx-admission-create
595 | labels:
596 | helm.sh/chart: ingress-nginx-3.4.0
597 | app.kubernetes.io/name: ingress-nginx
598 | app.kubernetes.io/instance: ingress-nginx
599 | app.kubernetes.io/version: 0.40.1
600 | app.kubernetes.io/managed-by: Helm
601 | app.kubernetes.io/component: admission-webhook
602 | spec:
603 | containers:
604 | - name: create
605 | image: docker.io/jettech/kube-webhook-certgen:v1.3.0
606 | imagePullPolicy: IfNotPresent
607 | args:
608 | - create
609 | - --host=ingress-nginx-controller-admission,ingress-nginx-controller-admission.$(POD_NAMESPACE).svc
610 | - --namespace=$(POD_NAMESPACE)
611 | - --secret-name=ingress-nginx-admission
612 | env:
613 | - name: POD_NAMESPACE
614 | valueFrom:
615 | fieldRef:
616 | fieldPath: metadata.namespace
617 | restartPolicy: OnFailure
618 | serviceAccountName: ingress-nginx-admission
619 | securityContext:
620 | runAsNonRoot: true
621 | runAsUser: 2000
622 | ---
623 | # Source: ingress-nginx/templates/admission-webhooks/job-patch/job-patchWebhook.yaml
624 | apiVersion: batch/v1
625 | kind: Job
626 | metadata:
627 | name: ingress-nginx-admission-patch
628 | annotations:
629 | helm.sh/hook: post-install,post-upgrade
630 | helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
631 | labels:
632 | helm.sh/chart: ingress-nginx-3.4.0
633 | app.kubernetes.io/name: ingress-nginx
634 | app.kubernetes.io/instance: ingress-nginx
635 | app.kubernetes.io/version: 0.40.1
636 | app.kubernetes.io/managed-by: Helm
637 | app.kubernetes.io/component: admission-webhook
638 | namespace: ingress-nginx
639 | spec:
640 | template:
641 | metadata:
642 | name: ingress-nginx-admission-patch
643 | labels:
644 | helm.sh/chart: ingress-nginx-3.4.0
645 | app.kubernetes.io/name: ingress-nginx
646 | app.kubernetes.io/instance: ingress-nginx
647 | app.kubernetes.io/version: 0.40.1
648 | app.kubernetes.io/managed-by: Helm
649 | app.kubernetes.io/component: admission-webhook
650 | spec:
651 | containers:
652 | - name: patch
653 | image: docker.io/jettech/kube-webhook-certgen:v1.3.0
654 | imagePullPolicy: IfNotPresent
655 | args:
656 | - patch
657 | - --webhook-name=ingress-nginx-admission
658 | - --namespace=$(POD_NAMESPACE)
659 | - --patch-mutating=false
660 | - --secret-name=ingress-nginx-admission
661 | - --patch-failure-policy=Fail
662 | env:
663 | - name: POD_NAMESPACE
664 | valueFrom:
665 | fieldRef:
666 | fieldPath: metadata.namespace
667 | restartPolicy: OnFailure
668 | serviceAccountName: ingress-nginx-admission
669 | securityContext:
670 | runAsNonRoot: true
671 | runAsUser: 2000
--------------------------------------------------------------------------------
/spark-docker/Dockerfile:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2018 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     https://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | ARG SPARK_IMAGE=gcr.io/spark-operator/spark-py:v3.0.0-hadoop3
18 | FROM ${SPARK_IMAGE}
19 |
20 | USER root
21 |
22 | # Setup for the Prometheus JMX exporter.
23 | # Add the Prometheus JMX exporter Java agent jar for exposing metrics sent to the JmxSink to Prometheus.
24 | ADD https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.11.0/jmx_prometheus_javaagent-0.11.0.jar /opt/spark/jars/jmx_prometheus_javaagent.jar
25 | RUN chmod 644 /opt/spark/jars/jmx_prometheus_javaagent.jar
26 |
27 | RUN mkdir -p /etc/metrics/conf
28 | COPY conf/metrics.properties /etc/metrics/conf
29 | COPY conf/prometheus.yaml /etc/metrics/conf
30 |
31 | # add s3a connector
32 | ADD https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.1.0/hadoop-aws-3.1.0.jar $SPARK_HOME/jars
33 | ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.874/aws-java-sdk-bundle-1.11.874.jar $SPARK_HOME/jars
34 | # ADD from a remote URL creates the file with 600 permissions owned by root;
35 | # make the jars world-readable so the non-root Spark runtime user can load
36 | # them (same reason the JMX exporter jar is chmod'ed above).
37 | RUN chmod 644 $SPARK_HOME/jars/hadoop-aws-3.1.0.jar $SPARK_HOME/jars/aws-java-sdk-bundle-1.11.874.jar
38 |
39 | ENTRYPOINT [ "/opt/entrypoint.sh" ]
40 |
--------------------------------------------------------------------------------
/spark-docker/conf/metrics.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2018 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # https://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | # Route metrics from every Spark component (driver, executors, master, worker)
18 | # to the JMX sink; the Prometheus JMX exporter javaagent (see the Dockerfile)
19 | # exposes what lands there over HTTP for scraping.
20 | *.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink
21 | # Additionally publish JVM metrics (GC, memory, threads) on driver and executors.
22 | driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource
23 | executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource
24 |
--------------------------------------------------------------------------------
/spark-docker/conf/prometheus.yaml:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2018 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # https://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | ---
18 | lowercaseOutputName: true
19 | attrNameSnakeCase: true
20 | rules:
21 |   # These come from the application driver if it's a streaming application
22 |   # Example: default/streaming.driver.com.example.ClassName.StreamingMetrics.streaming.lastCompletedBatch_schedulingDelay
23 |   - pattern: metrics<name=(\S+)\.(\S+)\.driver\.(\S+)\.StreamingMetrics\.streaming\.(\S+)><>Value
24 |     name: spark_streaming_driver_$4
25 |     labels:
26 |       app_namespace: "$1"
27 |       app_id: "$2"
28 |   # These come from the application driver if it's a structured streaming application
29 |   # Example: default/streaming.driver.spark.streaming.QueryName.inputRate-total
30 |   - pattern: metrics<name=(\S+)\.(\S+)\.driver\.spark\.streaming\.(\S+)\.(\S+)><>Value
31 |     name: spark_structured_streaming_driver_$4
32 |     labels:
33 |       app_namespace: "$1"
34 |       app_id: "$2"
35 |       query_name: "$3"
36 |   # These come from the application executors
37 |   # Example: default/spark-pi.0.executor.threadpool.activeTasks
38 |   - pattern: metrics<name=(\S+)\.(\S+)\.(\S+)\.executor\.(\S+)><>Value
39 |     name: spark_executor_$4
40 |     type: GAUGE
41 |     labels:
42 |       app_namespace: "$1"
43 |       app_id: "$2"
44 |       executor_id: "$3"
45 |   # These come from the application driver
46 |   # Example: default/spark-pi.driver.DAGScheduler.stage.failedStages
47 |   - pattern: metrics<name=(\S+)\.(\S+)\.driver\.(BlockManager|DAGScheduler|jvm)\.(\S+)><>Value
48 |     name: spark_driver_$3_$4
49 |     type: GAUGE
50 |     labels:
51 |       app_namespace: "$1"
52 |       app_id: "$2"
53 |   # These come from the application driver
54 |   # Emulate timers for DAGScheduler like messagePRocessingTime
55 |   - pattern: metrics<name=(\S+)\.(\S+)\.driver\.DAGScheduler\.(.*)><>Count
56 |     name: spark_driver_DAGScheduler_$3_count
57 |     type: COUNTER
58 |     labels:
59 |       app_namespace: "$1"
60 |       app_id: "$2"
61 |   # HiveExternalCatalog is of type counter
62 |   - pattern: metrics<name=(\S+)\.(\S+)\.driver\.HiveExternalCatalog\.(.*)><>Count
63 |     name: spark_driver_HiveExternalCatalog_$3_count
64 |     type: COUNTER
65 |     labels:
66 |       app_namespace: "$1"
67 |       app_id: "$2"
68 |   # These come from the application driver
69 |   # Emulate histograms for CodeGenerator
70 |   - pattern: metrics<name=(\S+)\.(\S+)\.driver\.CodeGenerator\.(.*)><>Count
71 |     name: spark_driver_CodeGenerator_$3_count
72 |     type: COUNTER
73 |     labels:
74 |       app_namespace: "$1"
75 |       app_id: "$2"
76 |   # These come from the application driver
77 |   # Emulate timer (keep only count attribute) plus counters for LiveListenerBus
78 |   - pattern: metrics<name=(\S+)\.(\S+)\.driver\.LiveListenerBus\.(.*)><>Count
79 |     name: spark_driver_LiveListenerBus_$3_count
80 |     type: COUNTER
81 |     labels:
82 |       app_namespace: "$1"
83 |       app_id: "$2"
84 |   # Get Gauge type metrics for LiveListenerBus
85 |   - pattern: metrics<name=(\S+)\.(\S+)\.driver\.LiveListenerBus\.(.*)><>Value
86 |     name: spark_driver_LiveListenerBus_$3
87 |     type: GAUGE
88 |     labels:
89 |       app_namespace: "$1"
90 |       app_id: "$2"
91 |   # Executors counters
92 |   - pattern: metrics<name=(\S+)\.(\S+)\.(.*)\.executor\.(.*)><>Count
93 |     name: spark_executor_$4_count
94 |     type: COUNTER
95 |     labels:
96 |       app_namespace: "$1"
97 |       app_id: "$2"
98 |       executor_id: "$3"
99 |   # These come from the application executors
100 |   # Example: app-20160809000059-0000.0.jvm.threadpool.activeTasks
101 |   - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.(jvm|NettyBlockTransfer)\.(.*)><>Value
102 |     name: spark_executor_$4_$5
103 |     type: GAUGE
104 |     labels:
105 |       app_namespace: "$1"
106 |       app_id: "$2"
107 |       executor_id: "$3"
108 |   - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.HiveExternalCatalog\.(.*)><>Count
109 |     name: spark_executor_HiveExternalCatalog_$4_count
110 |     type: COUNTER
111 |     labels:
112 |       app_namespace: "$1"
113 |       app_id: "$2"
114 |       executor_id: "$3"
115 |   # These come from the application driver
116 |   # Emulate histograms for CodeGenerator
117 |   - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.CodeGenerator\.(.*)><>Count
118 |     name: spark_executor_CodeGenerator_$4_count
119 |     type: COUNTER
120 |     labels:
121 |       app_namespace: "$1"
122 |       app_id: "$2"
123 |       executor_id: "$3"
--------------------------------------------------------------------------------
/vanilla-k8s/argo-workflows/argo-cm.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | data:
3 |   config: |
4 |     containerRuntimeExecutor: k8sapi
5 | kind: ConfigMap
6 | metadata:
7 |   annotations:
8 |     kubectl.kubernetes.io/last-applied-configuration: |
9 |       {"apiVersion":"v1","kind":"ConfigMap","metadata":{"annotations":{},"name":"workflow-controller-configmap","namespace":"argo"}}
10 |   name: workflow-controller-configmap
11 |   # Pin the namespace explicitly: the last-applied-configuration above records
12 |   # this ConfigMap in "argo", where the workflow controller reads it; without
13 |   # this, `kubectl apply` would target whatever the current namespace is.
14 |   namespace: argo
15 |
--------------------------------------------------------------------------------
/vanilla-k8s/argo-workflows/argo-workflow-sa.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1
2 | kind: ClusterRoleBinding
3 | metadata:
4 |   name: argo-workflow-role
5 | roleRef:
6 |   apiGroup: rbac.authorization.k8s.io
7 |   kind: ClusterRole
8 |   name: edit  # built-in aggregate role; broad grant — fine for a sandbox, scope down elsewhere
9 | subjects:
10 | - kind: ServiceAccount
11 |   name: default  # Argo workflows in these demos run pods as the default SA
12 |   namespace: default
13 | ---
14 | apiVersion: rbac.authorization.k8s.io/v1
15 | kind: Role
16 | metadata:
17 |   namespace: default
18 |   name: spark-op-role
19 | rules:
20 | - apiGroups: ["sparkoperator.k8s.io"]
21 |   resources: ["sparkapplications"]  # lets workflow resource steps create/watch SparkApplication CRs
22 |   verbs: ["*"]
23 | ---
24 | apiVersion: rbac.authorization.k8s.io/v1
25 | kind: RoleBinding
26 | metadata:
27 |   name: spark-op-role-binding
28 |   namespace: default
29 | subjects:
30 | - kind: ServiceAccount
31 |   name: spark  # SA used via serviceAccountName in spark-operator-kubernetes-dag.yaml
32 |   namespace: default
33 | roleRef:
34 |   kind: Role
35 |   name: spark-op-role
36 |   apiGroup: rbac.authorization.k8s.io
--------------------------------------------------------------------------------
/vanilla-k8s/argo-workflows/hello-world-dag.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: argoproj.io/v1alpha1
2 | kind: Workflow
3 | metadata:
4 |   generateName: dag-diamond-  # random suffix appended on submit
5 | spec:
6 |   entrypoint: diamond
7 |   templates:
8 |   - name: echo  # reusable step: prints its "message" input parameter
9 |     inputs:
10 |       parameters:
11 |       - name: message
12 |     container:
13 |       image: alpine:3.7
14 |       command: [echo, "{{inputs.parameters.message}}"]
15 |   - name: diamond  # diamond-shaped DAG: A -> (B, C) -> D
16 |     dag:
17 |       tasks:
18 |       - name: A
19 |         template: echo
20 |         arguments:
21 |           parameters: [{name: message, value: A}]
22 |       - name: B
23 |         dependencies: [A]
24 |         template: echo
25 |         arguments:
26 |           parameters: [{name: message, value: B}]
27 |       - name: C
28 |         dependencies: [A]
29 |         template: echo
30 |         arguments:
31 |           parameters: [{name: message, value: C}]
32 |       - name: D
33 |         dependencies: [B, C]
34 |         template: echo
35 |         arguments:
36 |           parameters: [{name: message, value: D}]
--------------------------------------------------------------------------------
/vanilla-k8s/argo-workflows/spark-kubernetes-dag.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: argoproj.io/v1alpha1
2 | kind: Workflow
3 | metadata:
4 |   generateName: spark-kubernetes-dag-  # trailing dash separates the generated suffix (cf. dag-diamond-)
5 | spec:
6 |   entrypoint: sparkling
7 |   templates:
8 |   - name: sparkpi
9 |     container:
10 |       image: k8s-test/spark:v2.4.4
11 |       imagePullPolicy: Never  # image must be pre-loaded into the local cluster (e.g. `kind load`)
12 |       command: [
13 |         "/bin/sh",
14 |         "-c",
15 |         "/opt/spark/bin/spark-submit \
16 |         --master k8s://https://172.18.0.2:6443 \
17 |         --deploy-mode cluster \
18 |         --name spark-pi \
19 |         --class org.apache.spark.examples.SparkPi \
20 |         --conf spark.executor.instances=2 \
21 |         --conf spark.kubernetes.container.image=k8s-test/spark-on-localhost:v1.0 \
22 |         --conf spark.kubernetes.container.image.pullPolicy=Never \
23 |         --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
24 |         --executor-memory 500M \
25 |         /opt/spark/examples/jars/spark-examples_2.11-2.4.4.jar 100"
26 |       ]
27 |   - name: spark-eks  # NOTE(review): defined but not referenced by the "sparkling" DAG below
28 |     container:
29 |       image: k8s-test/spark:v2.4.4
30 |       imagePullPolicy: Never
31 |       command: [
32 |         "/bin/sh",
33 |         "-c",
34 |         "/opt/spark/bin/spark-submit \
35 |         --master k8s://https://172.18.0.2:6443 \
36 |         --deploy-mode cluster \
37 |         --name spark-on-localhost \
38 |         --class ValueZones \
39 |         --conf spark.executor.instances=1 \
40 |         --conf spark.executor.memory=2G \
41 |         --conf spark.executor.cores=2 \
42 |         --conf spark.sql.shuffle.partitions=60 \
43 |         --conf spark.kubernetes.container.image=k8s-test/spark-on-localhost:v1.0 \
44 |         --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
45 |         --conf spark.kubernetes.container.image.pullPolicy=Never \
46 |         --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
47 |         local:///opt/spark/jars/spark-on-eks-assembly-v1.0.jar \
48 |         \"s3a://nyc-tlc/trip data/yellow_tripdata_2018*.csv\" \
49 |         \"s3a://nyc-tlc/trip data/green_tripdata_2018*.csv\" \
50 |         \"s3a://nyc-tlc/misc/taxi _zone_lookup.csv\" \
51 |         \"local:///tmp/\""
52 |       ]
53 |   - name: sparkling  # three spark-pi runs: SparkPi1 -> (SparkPi2, SparkPi3) in parallel
54 |     dag:
55 |       tasks:
56 |       - name: SparkPi1
57 |         template: sparkpi
58 |       - name: SparkPi2
59 |         dependencies: [SparkPi1]
60 |         template: sparkpi
61 |       - name: SparkPi3
62 |         dependencies: [SparkPi1]
63 |         template: sparkpi
64 |
--------------------------------------------------------------------------------
/vanilla-k8s/argo-workflows/spark-operator-kubernetes-dag.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: argoproj.io/v1alpha1
2 | kind: Workflow
3 | metadata:
4 |   generateName: spark-kubernetes-dag-  # trailing dash separates the generated suffix (cf. dag-diamond-)
5 | spec:
6 |   entrypoint: sparkling-operator
7 |   serviceAccountName: spark  # needs create/watch on sparkapplications (see spark-op-role binding)
8 |   templates:
9 |   - name: sparkpi  # resource template: submits a SparkApplication CR and waits on its state
10 |     resource:
11 |       action: create
12 |       successCondition: status.applicationState.state in (COMPLETED)
13 |       failureCondition: 'status.applicationState.state in (FAILED, SUBMISSION_FAILED, UNKNOWN)'
14 |       manifest: |
15 |         apiVersion: "sparkoperator.k8s.io/v1beta2"
16 |         kind: SparkApplication
17 |         metadata:
18 |           generateName: spark-pi-
19 |         spec:
20 |           type: Scala
21 |           mode: cluster
22 |           image: "gcr.io/spark-operator/spark:v3.0.0"
23 |           imagePullPolicy: Always
24 |           mainClass: org.apache.spark.examples.SparkPi
25 |           mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.12-3.0.0.jar"
26 |           sparkVersion: "3.0.0"
27 |           restartPolicy:
28 |             type: Never
29 |           volumes:
30 |           - name: "test-volume"
31 |             hostPath:
32 |               path: "/tmp"
33 |               type: Directory
34 |           driver:
35 |             cores: 1
36 |             coreLimit: "1200m"
37 |             memory: "512m"
38 |             labels:
39 |               version: 3.0.0
40 |             serviceAccount: spark
41 |             volumeMounts:
42 |             - name: "test-volume"
43 |               mountPath: "/tmp"
44 |           executor:
45 |             cores: 1
46 |             instances: 1
47 |             memory: "512m"
48 |             labels:
49 |               version: 3.0.0
50 |             volumeMounts:
51 |             - name: "test-volume"
52 |               mountPath: "/tmp"
53 |   - name: sparkling-operator  # SparkPi1 -> (SparkPi2, SparkPi3) in parallel
54 |     dag:
55 |       tasks:
56 |       - name: SparkPi1
57 |         template: sparkpi
58 |       - name: SparkPi2
59 |         dependencies: [SparkPi1]
60 |         template: sparkpi
61 |       - name: SparkPi3
62 |         dependencies: [SparkPi1]
63 |         template: sparkpi
64 |
--------------------------------------------------------------------------------
/vanilla-k8s/spark-operator/spark-application.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: "sparkoperator.k8s.io/v1beta2"
2 | kind: SparkApplication
3 | metadata:
4 | name: spark-pi
5 | namespace: default
6 | spec:
7 | type: Scala
8 | mode: cluster
9 | image: "gcr.io/spark-operator/spark:v3.0.0"
10 | imagePullPolicy: Always
11 | mainClass: org.apache.spark.examples.SparkPi
12 | mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.12-3.0.0.jar"
13 | sparkVersion: "3.0.0"
14 | restartPolicy:
15 | type: Never
16 | volumes:
17 | - name: "test-volume"
18 | hostPath:
19 | path: "/tmp"
20 | type: Directory
21 | driver:
22 | cores: 1
23 | coreLimit: "1200m"
24 | memory: "512m"
25 | labels:
26 | version: 3.0.0
27 | serviceAccount: spark
28 | volumeMounts:
29 | - name: "test-volume"
30 | mountPath: "/tmp"
31 | executor:
32 | cores: 1
33 | instances: 1
34 | memory: "512m"
35 | labels:
36 | version: 3.0.0
37 | volumeMounts:
38 | - name: "test-volume"
39 | mountPath: "/tmp"
--------------------------------------------------------------------------------
/vanilla-k8s/spark-submit/test-job-eks.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: batch/v1
2 | kind: Job
3 | metadata:
4 |   name: spark-on-localhost
5 | spec:
6 |   template:
7 |     spec:
8 |       containers:
9 |       - name: spark
10 |         image: k8s-test/spark:v2.4.4  # NOTE(review): master URL below hard-codes the kind control-plane IP — confirm per cluster
11 |         imagePullPolicy: Never  # image must be pre-loaded into the local cluster (e.g. `kind load`)
12 |         command: [
13 |           "/bin/sh",
14 |           "-c",
15 |           "/opt/spark/bin/spark-submit \
16 |           --master k8s://https://172.18.0.2:6443 \
17 |           --deploy-mode cluster \
18 |           --name spark-on-localhost \
19 |           --class ValueZones \
20 |           --conf spark.executor.instances=1 \
21 |           --conf spark.executor.memory=2G \
22 |           --conf spark.executor.cores=2 \
23 |           --conf spark.sql.shuffle.partitions=60 \
24 |           --conf spark.kubernetes.container.image=k8s-test/spark-on-localhost:v1.0 \
25 |           --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
26 |           --conf spark.kubernetes.container.image.pullPolicy=Never \
27 |           --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
28 |           local:///opt/spark/jars/spark-on-eks-assembly-v1.0.jar \
29 |           \"s3a://nyc-tlc/trip data/yellow_tripdata_2018*.csv\" \
30 |           \"s3a://nyc-tlc/trip data/green_tripdata_2018*.csv\" \
31 |           \"s3a://nyc-tlc/misc/taxi _zone_lookup.csv\" \
32 |           \"local:///tmp/\""
33 |         ]
34 |       serviceAccountName: spark  # see test-sa.yaml
35 |       restartPolicy: Never
36 |   backoffLimit: 4
--------------------------------------------------------------------------------
/vanilla-k8s/spark-submit/test-job-example.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: batch/v1
2 | kind: Job
3 | metadata:
4 |   name: spark-on-localhost-example
5 | spec:
6 |   template:
7 |     spec:
8 |       containers:
9 |       - name: spark
10 |         image: k8s-test/spark:v2.4.4  # NOTE(review): master URL below hard-codes the kind control-plane IP — confirm per cluster
11 |         imagePullPolicy: Never  # image must be pre-loaded into the local cluster (e.g. `kind load`)
12 |         command: [
13 |           "/bin/sh",
14 |           "-c",
15 |           "/opt/spark/bin/spark-submit \
16 |           --master k8s://https://172.18.0.2:6443 \
17 |           --deploy-mode cluster \
18 |           --name spark-pi \
19 |           --class org.apache.spark.examples.SparkPi \
20 |           --conf spark.executor.instances=2 \
21 |           --conf spark.kubernetes.container.image=k8s-test/spark-on-localhost:v1.0 \
22 |           --conf spark.kubernetes.container.image.pullPolicy=Never \
23 |           --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
24 |           --executor-memory 500M \
25 |           /opt/spark/examples/jars/spark-examples_2.11-2.4.4.jar 100"
26 |         ]
27 |       serviceAccountName: spark  # see test-sa.yaml
28 |       restartPolicy: Never
29 |   backoffLimit: 4
--------------------------------------------------------------------------------
/vanilla-k8s/spark-submit/test-sa.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 |   name: spark  # SA referenced by spark-submit via spark.kubernetes.authenticate.driver.serviceAccountName
5 | ---
6 | apiVersion: rbac.authorization.k8s.io/v1
7 | kind: ClusterRoleBinding
8 | metadata:
9 |   name: spark-role
10 | roleRef:
11 |   apiGroup: rbac.authorization.k8s.io
12 |   kind: ClusterRole
13 |   name: edit  # broad built-in role; fine for a sandbox, scope down outside of it
14 | subjects:
15 | - kind: ServiceAccount
16 |   name: spark
17 |   namespace: default
--------------------------------------------------------------------------------