├── trino.png
├── kube-build
│   ├── Chart.yaml
│   ├── templates
│   │   ├── kaniko-trino.yaml
│   │   └── kaniko-devserver.yaml
│   └── deploy.sh
├── tf
│   ├── namespace
│   │   ├── outputs.tf
│   │   ├── variables.tf
│   │   └── namespace.tf
│   ├── outputs.tf
│   ├── redis
│   │   ├── outputs.tf
│   │   ├── variables.tf
│   │   └── redis.tf
│   ├── devserver
│   │   ├── variables.tf
│   │   └── devserver.tf
│   ├── superset
│   │   ├── outputs.tf
│   │   ├── superset_init.sh
│   │   ├── variables.tf
│   │   ├── superset_config.py
│   │   └── superset.tf
│   ├── postgres
│   │   ├── outputs.tf
│   │   ├── variables.tf
│   │   └── postgres.tf
│   ├── minio
│   │   ├── outputs.tf
│   │   ├── variables.tf
│   │   └── minio.tf
│   ├── nessie-values.yaml
│   ├── trino-values.yaml
│   ├── test-hive.sh
│   ├── main.tf
│   └── variables.tf
├── components.png
├── superset.png
├── superset-external.png
├── docker
│   ├── superset
│   │   ├── push.sh
│   │   ├── build.sh
│   │   └── Dockerfile
│   ├── devserver
│   │   ├── push.sh
│   │   ├── build.sh
│   │   └── Dockerfile
│   └── metastore
│       ├── push.sh
│       ├── build.sh
│       └── Dockerfile
├── .gitignore
├── LICENSE
├── components.drawio
└── README.md

/trino.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scie-nz/walden/HEAD/trino.png
--------------------------------------------------------------------------------
/kube-build/Chart.yaml:
--------------------------------------------------------------------------------
1 | name: walden
2 | version: 1 # required by helm
3 | 
--------------------------------------------------------------------------------
/tf/namespace/outputs.tf:
--------------------------------------------------------------------------------
1 | output "name" {
2 |   value = var.name
3 | }
4 | 
--------------------------------------------------------------------------------
/tf/namespace/variables.tf:
--------------------------------------------------------------------------------
1 | variable "name" {
2 |   type = string
3 | }
4 | 
--------------------------------------------------------------------------------
/components.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scie-nz/walden/HEAD/components.png
--------------------------------------------------------------------------------
/superset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scie-nz/walden/HEAD/superset.png
--------------------------------------------------------------------------------
/tf/outputs.tf:
--------------------------------------------------------------------------------
1 | output "namespace" {
2 |   value = module.namespace.name
3 | }
4 | 
--------------------------------------------------------------------------------
/superset-external.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scie-nz/walden/HEAD/superset-external.png
--------------------------------------------------------------------------------
/tf/namespace/namespace.tf:
--------------------------------------------------------------------------------
1 | resource "kubernetes_namespace" "namespace" {
2 |   metadata {
3 |     name = var.name
4 |   }
5 | }
6 | 
--------------------------------------------------------------------------------
/tf/redis/outputs.tf:
--------------------------------------------------------------------------------
1 | output "pass" {
2 |   value = random_password.pass.result
3 |   sensitive = true
4 | }
5 | output "secret_name" {
6 |   value = kubernetes_secret.redis.metadata[0].name
7 | }
8 | 
--------------------------------------------------------------------------------
/tf/devserver/variables.tf:
--------------------------------------------------------------------------------
1 | variable "namespace" {
2 |   type = string
3 | }
4 | 
5 | variable "image" {
6 |   type = string
7 | }
8 | 
9 | variable "minio_secret_name" {
10 |   type = string
11 | }
12 | 
--------------------------------------------------------------------------------
/tf/superset/outputs.tf:
--------------------------------------------------------------------------------
1 | output "user" {
2 |   value = var.username
3 |   sensitive = true
4 | }
5 | output "pass" {
6 |   value = var.password == "" ? random_password.admin_pass[0].result : var.password
7 |   sensitive = true
8 | }
9 | 
--------------------------------------------------------------------------------
/docker/superset/push.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Allow custom tag via either SUPERSET_TAG or TAG, default is current date
4 | TAG=${SUPERSET_TAG:=superset-${TAG:=$(date +%Y.%m.%d)}}
5 | 
6 | # Allow custom registry/org via ORG
7 | docker push ${ORG:=ghcr.io/scie-nz}/walden:$TAG
8 | 
--------------------------------------------------------------------------------
/docker/devserver/push.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Allow custom tag via either DEVSERVER_TAG or TAG, default is current date
4 | TAG=${DEVSERVER_TAG:=devserver-${TAG:=$(date +%Y.%m.%d)}}
5 | 
6 | # Allow custom registry/org via ORG
7 | docker push ${ORG:=ghcr.io/scie-nz}/walden:$TAG
8 | 
--------------------------------------------------------------------------------
/docker/metastore/push.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Allow custom tag via either METASTORE_TAG or TAG, default is current date
4 | TAG=${METASTORE_TAG:=metastore-${TAG:=$(date +%Y.%m.%d)}}
5 | 
6 | # Allow custom registry/org via ORG
7 | docker push ${ORG:=ghcr.io/scie-nz}/walden:$TAG
8 | 
--------------------------------------------------------------------------------
/docker/superset/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Allow custom tag via either SUPERSET_TAG or TAG, default is current date
4 | TAG=${SUPERSET_TAG:=superset-${TAG:=$(date +%Y.%m.%d)}}
5 | 
6 | # Allow custom registry/org via ORG
7 | docker build . -t ${ORG:=ghcr.io/scie-nz}/walden:$TAG
8 | 
--------------------------------------------------------------------------------
/docker/devserver/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Allow custom tag via either DEVSERVER_TAG or TAG, default is current date
4 | TAG=${DEVSERVER_TAG:=devserver-${TAG:=$(date +%Y.%m.%d)}}
5 | 
6 | # Allow custom registry/org via ORG
7 | docker build . -t ${ORG:=ghcr.io/scie-nz}/walden:$TAG
8 | 
--------------------------------------------------------------------------------
/docker/metastore/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Allow custom tag via either METASTORE_TAG or TAG, default is current date
4 | TAG=${METASTORE_TAG:=metastore-${TAG:=$(date +%Y.%m.%d)}}
5 | 
6 | # Allow custom registry/org via ORG
7 | docker build . -t ${ORG:=ghcr.io/scie-nz}/walden:$TAG
8 | 
--------------------------------------------------------------------------------
/tf/postgres/outputs.tf:
--------------------------------------------------------------------------------
1 | output "user" {
2 |   value = random_password.user.result
3 |   sensitive = true
4 | }
5 | output "pass" {
6 |   value = random_password.pass.result
7 |   sensitive = true
8 | }
9 | output "secret_name" {
10 |   value = kubernetes_secret.postgres.metadata[0].name
11 | }
12 | 
--------------------------------------------------------------------------------
/tf/postgres/variables.tf:
--------------------------------------------------------------------------------
1 | variable "namespace" {
2 |   type = string
3 | }
4 | 
5 | variable "name" {
6 |   type = string
7 | }
8 | 
9 | variable "image" {
10 |   type = string
11 | }
12 | 
13 | variable "db" {
14 |   type = string
15 | }
16 | 
17 | variable "storage" {
18 |   type = string
19 | }
20 | 
--------------------------------------------------------------------------------
/tf/redis/variables.tf:
--------------------------------------------------------------------------------
1 | variable "namespace" {
2 |   type = string
3 | }
4 | 
5 | variable "name" {
6 |   type = string
7 | }
8 | 
9 | variable "image" {
10 |   type = string
11 | }
12 | 
13 | variable "max_memory" {
14 |   type = string
15 | }
16 | variable "storage" {
17 |   type = string
18 | }
19 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore any files named "values*.yaml", except for "kube/values-default.yaml"
2 | **/values*.yaml
3 | !kube/values-default.yaml
4 | 
5 | *.iml
6 | *.iws
7 | .idea/
8 | 
9 | .terraform/
10 | .terraform.lock.hcl
11 | terraform.tfstate
12 | terraform.tfstate.backup
13 | .terraform.tfstate.lock.info
14 | 
--------------------------------------------------------------------------------
/tf/minio/outputs.tf:
--------------------------------------------------------------------------------
1 | output "user" {
2 |   value = var.username
3 |   sensitive = true
4 | }
5 | output "pass" {
6 |   value = var.password == "" ? random_password.admin_pass[0].result : var.password
7 |   sensitive = true
8 | }
9 | output "secret_name" {
10 |   value = kubernetes_secret.minio.metadata[0].name
11 | }
12 | 
--------------------------------------------------------------------------------
/tf/nessie-values.yaml:
--------------------------------------------------------------------------------
1 | versionStoreType: JDBC2
2 | jdbc:
3 |   jdbcUrl: "${jdbc_url}"
4 |   secret:
5 |     name: "${jdbc_secret_name}"
6 |     username: user
7 |     password: pass
8 | catalog:
9 |   iceberg:
10 |     defaultWarehouse: warehouse
11 |     warehouses:
12 |     - name: warehouse
13 |       location: "s3://demobucket/"
14 |   storage:
15 |     s3:
16 |       defaultOptions:
17 |         endpoint: "http://minio:9000/"
18 |         pathStyleAccess: true
19 |         accessKeySecret:
20 |           name: minio
21 |           awsAccessKeyId: user
22 |           awsSecretAccessKey: pass
23 | 
--------------------------------------------------------------------------------
/tf/trino-values.yaml:
--------------------------------------------------------------------------------
1 | server:
2 |   workers: ${worker_replicas}
3 | coordinator:
4 |   nodeSelector: ${coordinator_node_selector}
5 |   config:
6 |     query:
7 |       maxMemoryPerNode: ${coordinator_query_mem_limit}
8 |   resources:
9 |     limits:
10 |       memory: ${coordinator_mem_limit}
11 |     requests:
12 |       memory: ${coordinator_mem_limit}
13 |   jvm:
14 |     maxHeapSize: ${coordinator_max_heap}
15 | worker:
16 |   nodeSelector: ${worker_node_selector}
17 |   config:
18 |     query:
19 |       maxMemoryPerNode: ${worker_query_mem_limit}
20 |   resources:
21 |     limits:
22 |       memory: ${worker_mem_limit}
23 |     requests:
24 |       memory: ${worker_mem_limit}
25 |   jvm:
26 |     maxHeapSize: ${worker_max_heap}
27 | 
--------------------------------------------------------------------------------
/tf/superset/superset_init.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -eu
3 | 
4 | echo "Upgrading DB schema..."
5 | superset db upgrade
6 | 
7 | echo "Initializing roles..."
8 | superset init
9 | 
10 | if [ -n "$ADMIN_USER" -a -n "$ADMIN_PASS" ]; then
11 |   echo "Creating admin user '${ADMIN_USER}'..."
12 |   superset fab create-admin \
13 |     --username "${ADMIN_USER}" \
14 |     --firstname Superset \
15 |     --lastname Admin \
16 |     --email admin@superset.com \
17 |     --password "${ADMIN_PASS}" \
18 |     || true
19 | fi
20 | 
21 | if [ -f "/app/pythonpath/superset_datasources.yaml" ]; then
22 |   echo "Importing database connections..."
23 |   superset import_datasources -p /app/pythonpath/superset_datasources.yaml
24 | fi
25 | 
--------------------------------------------------------------------------------
/kube-build/templates/kaniko-trino.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: v1
3 | kind: Pod
4 | metadata:
5 |   name: kaniko
6 |   namespace: walden
7 | spec:
8 |   nodeSelector:
9 |     kubernetes.io/arch: amd64
10 |   containers:
11 |   - name: kaniko
12 |     image: gcr.io/kaniko-project/executor:debug
13 |     args:
14 |     - "--dockerfile=Dockerfile"
15 |     - "--context-sub-path=docker/trino"
16 |     - "--context=git://github.com/scie-nz/walden.git"
17 |     - "--destination=scienz/walden-trino"
18 |     volumeMounts:
19 |     - name: docker-config
20 |       mountPath: /kaniko/.docker
21 |   volumes:
22 |   - name: docker-config
23 |     projected:
24 |       sources:
25 |       - secret:
26 |           name: regcred
27 |           items:
28 |           - key: .dockerconfigjson
29 |             path: config.json
30 |   restartPolicy: Never
31 | 
--------------------------------------------------------------------------------
/kube-build/templates/kaniko-devserver.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: v1
3 | kind: Pod
4 | metadata:
5 |   name: kaniko
6 |   namespace: walden
7 | spec:
8 |   nodeSelector:
9 |     kubernetes.io/arch: amd64
10 |   containers:
11 |   - name: kaniko
12 |     image: gcr.io/kaniko-project/executor:debug
13 |     args:
14 |     - "--dockerfile=Dockerfile"
15 |     - "--context-sub-path=docker/devserver"
16 |     - "--context=git://github.com/scie-nz/walden.git"
17 |     - "--destination=scienz/walden-devserver"
18 |     volumeMounts:
19 |     - name: docker-config
20 |       mountPath: /kaniko/.docker
21 |   volumes:
22 |   - name: docker-config
23 |     projected:
24 |       sources:
25 |       - secret:
26 |           name: regcred
27 |           items:
28 |           - key: .dockerconfigjson
29 |             path: config.json
30 |   restartPolicy: Never
31 | 
--------------------------------------------------------------------------------
/tf/test-hive.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # script to run in devserver pod
4 | read -d '' EXEC_SCRIPT << EOF
5 | echo "mc alias"
6 | mc alias set walden-minio/ http://minio:9000 \$MINIO_ACCESS_KEY_ID \$MINIO_ACCESS_KEY_SECRET || exit 1
7 | 
8 | echo "mc rb"
9 | mc rb --force walden-minio/direct
10 | 
11 | echo "mc mb"
12 | mc mb walden-minio/direct
13 | 
14 | trino_cmd() {
15 |   echo "trino: \$1"
16 |   trino-cli --server trino --user walden --execute="\$1"
17 | }
18 | 
19 | trino_cmd "DROP TABLE IF EXISTS hive.direct.dim_foo"
20 | trino_cmd "DROP SCHEMA IF EXISTS hive.direct"
21 | trino_cmd "CREATE SCHEMA hive.direct WITH (location='s3a://direct/')"
22 | trino_cmd "CREATE TABLE hive.direct.dim_foo(key VARCHAR, val BIGINT)"
23 | trino_cmd "INSERT INTO hive.direct.dim_foo VALUES ('this', 1), ('is', 2), ('a', 3), ('test', 4)"
24 | trino_cmd "SELECT key, val FROM hive.direct.dim_foo"
25 | 
26 | echo "mc ls"
27 | mc ls -r walden-minio/direct
28 | EOF
29 | 
30 | kubectl exec -it -n walden deployment/devserver -- /bin/bash -c "$EXEC_SCRIPT"
31 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2021 SCIE.NZ
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/kube-build/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
4 | 
5 | # strict mode
6 | set -euo pipefail
7 | IFS=$'\n\t'
8 | 
9 | # print line on error
10 | err_report() {
11 |   echo "Error on line $1"
12 | }
13 | trap 'err_report $LINENO' ERR
14 | 
15 | # set namespace, then reset back to current afterwards
16 | # this allows us to apply across namespaces in a single 'apply' command, while still having an assigned default
17 | TARGET_NAMESPACE=walden
18 | ORIG_NAMESPACE=$(kubectl config view --minify --output 'jsonpath={..namespace}')
19 | if [ -z "$ORIG_NAMESPACE" ]; then
20 |   ORIG_NAMESPACE=$TARGET_NAMESPACE
21 | fi
22 | 
23 | reset_namespace() {
24 |   echo "Switching back to namespace: $ORIG_NAMESPACE"
25 |   kubectl config set-context --current --namespace=$ORIG_NAMESPACE
26 | }
27 | trap reset_namespace EXIT
28 | 
29 | echo "Switching to namespace: $TARGET_NAMESPACE"
30 | # if namespace doesn't exist, create it
31 | kubectl create namespace $TARGET_NAMESPACE --dry-run=client -o yaml | kubectl apply -f -
32 | kubectl config set-context --current --namespace=$TARGET_NAMESPACE
33 | 
34 | helm template -g $SCRIPT_DIR | kubectl apply -f -
35 | 
--------------------------------------------------------------------------------
/tf/minio/variables.tf:
--------------------------------------------------------------------------------
1 | variable "namespace" {
2 |   type = string
3 | }
4 | 
5 | variable "name" {
6 |   type = string
7 | }
8 | 
9 | variable "image" {
10 |   type = string
11 | }
12 | 
13 | variable "username" {
14 |   type = string
15 |   validation {
16 |     condition = length(var.username) >= 3
17 |     error_message = "Minio username must be at least 3 characters long"
18 |   }
19 | }
20 | variable "password" {
21 |   type = string
22 |   validation {
23 |     condition = var.password == "" || length(var.password) >= 8
24 |     error_message = "Minio password must be at least 8 characters long"
25 |   }
26 | }
27 | variable "replicas" {
28 |   type = number
29 |   validation {
30 |     condition = var.replicas == 1 || var.replicas >= 4
31 |     error_message = "Minio replicas must be 1 (standalone) or at least 4 (distributed)"
32 |   }
33 | }
34 | variable "mem_limit" {
35 |   type = string
36 | }
37 | variable "node_selector" {
38 |   type = map
39 | }
40 | variable "tolerations" {
41 |   type = list(object({
42 |     effect = string
43 |     key = string
44 |     operator = string
45 |     value = string
46 |   }))
47 | }
48 | variable "storage" {
49 |   type = string
50 | }
51 | variable "nfs_server" {
52 |   type = string
53 | }
54 | variable "nfs_path" {
55 |   type = string
56 | }
57 | 
--------------------------------------------------------------------------------
/docker/superset/Dockerfile:
--------------------------------------------------------------------------------
1 | # Based on the latest stable release
2 | # check https://hub.docker.com/r/apache/superset/tags (search for '4.' to get releases)
3 | FROM apache/superset:4.1.1
4 | 
5 | USER root
6 | # Geckodriver prerequisites
7 | RUN apt-get update \
8 |   && apt-get -y install --no-install-recommends firefox-esr \
9 |   && apt-get -y upgrade \
10 |   && apt-get clean \
11 |   && rm -rf /var/lib/apt/lists /var/cache/apt/archives
12 | 
13 | # Install geckodriver to support rendering charts in email alerts
14 | ENV GECKODRIVER_VERSION=0.35.0
15 | RUN wget https://github.com/mozilla/geckodriver/releases/download/v${GECKODRIVER_VERSION}/geckodriver-v${GECKODRIVER_VERSION}-linux64.tar.gz \
16 |   && tar -x geckodriver -zf geckodriver-v${GECKODRIVER_VERSION}-linux64.tar.gz -O > /usr/bin/geckodriver \
17 |   && rm geckodriver-v${GECKODRIVER_VERSION}-linux64.tar.gz \
18 |   && chmod 755 /usr/bin/geckodriver \
19 |   && geckodriver --version
20 | USER superset
21 | 
22 | # Install a selection of drivers for connecting Superset to various database types and auth integrations.
23 | #
24 | # See here for full list of supported DBs and their connection strings:
25 | # https://superset.apache.org/docs/configuration/databases/
26 | #
27 | # - authlib + flask-oidc: Support OIDC auth
28 | # - trino: Support talking to Trino in Walden
29 | RUN pip3 install --break-system-packages \
30 |   authlib==1.4.0 \
31 |   flask-oidc==2.2.2 \
32 |   \
33 |   trino==0.332.0
34 | 
--------------------------------------------------------------------------------
/docker/devserver/Dockerfile:
--------------------------------------------------------------------------------
1 | # - Trino wants java 23: https://trino.io/docs/current/installation/deployment.html
2 | # - Ubuntu only has openjdk-23-jre in 24.10+ (not in 24.04LTS): https://packages.ubuntu.com/search?keywords=openjdk-23-jre&searchon=names&suite=all&section=all
3 | # So we go with 24.10:
4 | FROM ubuntu:24.10
5 | 
6 | ENV DEBIAN_FRONTEND=noninteractive
7 | 
8 | RUN apt-get update \
9 |   && apt-get -y install curl git gnupg2 less openjdk-23-jre python3-pip tzdata unzip vim wget \
10 |   && apt-get -y upgrade \
11 |   && apt-get clean \
12 |   && rm -rf /var/lib/apt/lists /var/cache/apt/archives
13 | 
14 | ENV TRINO_VERSION=468 \
15 |     TRINO_PY_VERSION=0.332.0 \
16 |     MINIO_VERSION=RELEASE.2025-01-17T23-25-50Z \
17 |     MINIO_PY_VERSION=7.2.15
18 | 
19 | RUN \
20 |   curl -o /usr/bin/mc https://dl.min.io/client/mc/release/linux-amd64/archive/mc.${MINIO_VERSION} \
21 |   && chmod +x /usr/bin/mc \
22 |   && mc --version \
23 |   \
24 |   && curl -o /usr/bin/trino-cli https://repo1.maven.org/maven2/io/trino/trino-cli/${TRINO_VERSION}/trino-cli-${TRINO_VERSION}-executable.jar \
25 |   && echo 'trino-cli --server trino --user "${TRINO_USER}" --catalog hive --schema $1' > /usr/bin/trino \
26 |   && chmod +x /usr/bin/trino-cli /usr/bin/trino \
27 |   && trino-cli --version \
28 |   \
29 |   && pip3 install --break-system-packages minio==${MINIO_PY_VERSION} trino==${TRINO_PY_VERSION} \
30 |   && python3 -c "import minio; print('minio {}'.format(minio.__version__))" \
31 |   && python3 -c "import trino; print('trino {}'.format(trino.__version__))"
32 | 
--------------------------------------------------------------------------------
/components.drawio:
--------------------------------------------------------------------------------
1 | 7Vpbb9owFP41PHZKYhLCY4FunUbVdmxqu5fJTdzgYeLIMQX262c3DoljKGkVbtVekH1iO/Z3vnNzaIH+dPGFwWR8RUNEWo4VLlpg0HKcjuWIXylYZgK/AzJBxHCYiexCMMJ/kRJaSjrDIUq1gZxSwnGiCwMaxyjgmgwyRuf6sCdK9LcmMEKGYBRAYkrvcMjH6hSuVcgvEY7G+ZttSz2ZwnywEqRjGNJ5SQQuWqDPKOVZa7roIyKxy3HJ5n3e8HS1MYZiXmfCN3/4C/yGwblzffazfX0bXkySM6ebLfMMyUydWO2WL3MIGJ3FIZKrWC3Qm48xR6MEBvLpXOhcyMZ8SkTPFk21HGIcLTZu1F4dX9AG0SnibCmGqAleRyGmKOPmkM4LBdg5quMy+PlAqJQerdYucBENBc1bYLKOD6ZOW4cJeAeHyfaPDybX02Gyu3VhAjtjk7sdJhQKN6S6lPExjWgMyUUh7elAFmOGlCYKvj+I86XyqXDG6Tpw5Yteh1bsi85YgF5TvPLMkEWIvzbOWa8rhgjk+FnfSPP0NGD/wXBMW45HxKZ7j0y0ItnqU8pCHENOmaGYLYyFaZKFoSe8kMpphML2dkv31zDY3xmBDSBvaMojhka3QwMwEfcS2QyWBAvCMrDd7B8zag8fVwIYTKIXwl/PuFgGKXmacdt2G3KpFV8B1viKdUB7uwLadBQjwUmZspwwyl63Et/BgVHufFA6u74OtOMeGGgzPfiOQpx+KIydA2Nsm6nqhih3R9kEMRP9A0Q4r5rL+geOcHbtXOGYUAQVFDuHRtFMFE4ARefIsi3bjE+jWSLQEql2FS9xbK5DlHJGJ6hPichlwSCm0hH2njAhFREkOIqlzxUwISHvSRBxAMm5ejDFYfhSeKzTiV6MNOFXa5Rt3hot7K5oMz3CpaxYHOsKcZiKtMxMyj6MNurcNXT3qo0aNw0oDs/lDaCEkcA0xYGuCx0lgQRb3pc7D7Lzyc27g0X54WCZ9xaY36sVZbs0S/SKSbKz1DTSTOVdt/Qu3TlqGi5p0F2jwVxWu0JXb7ihWBxlYwnbrjIjO6ia5ZRuMSsLVS8HvarbzYAwFnph2erY7yceMNOrKxx/vf64tl/nOnavnhiYnrgx2//UcTXzr2n8hcE/lFzB7o3fr2n8GzR8YrZfzQnAvm3fTGebI55zSsQD4CSIZ+l8WaUobyaef2DiAYN4A/ScIiYcu1lLJTQ0aLnvry7tiqk6NYuonV2LgPb/oKF/y9yeMXaPKmo4TUWNzp6Nd4elymlFjfq1ynExr/1e5rmVzNluinmiW/xxJBte/PsGXPwD
--------------------------------------------------------------------------------
/tf/superset/variables.tf:
--------------------------------------------------------------------------------
1 | variable "namespace" {
2 |   type = string
3 | }
4 | 
5 | variable "image_busybox" {
6 |   type = string
7 | }
8 | variable "image_superset" {
9 |   type = string
10 | }
11 | 
12 | variable "username" {
13 |   type = string
14 | }
15 | variable "password" {
16 |   type = string
17 | }
18 | variable "worker_replicas" {
19 |   type = number
20 | }
21 | variable "mem_limit_server" {
22 |   type = string
23 | }
24 | variable "mem_limit_worker" {
25 |   type = string
26 | }
27 | 
28 | variable "postgres_host" {
29 |   type = string
30 | }
31 | variable "postgres_port" {
32 |   type = number
33 | }
34 | variable "postgres_db" {
35 |   type = string
36 | }
37 | variable "postgres_secret_name" {
38 |   type = string
39 | }
40 | 
41 | variable "redis_host" {
42 |   type = string
43 | }
44 | variable "redis_secret_name" {
45 |   type = string
46 | }
47 | 
48 | variable "extra_datasources" {
49 |   type = string
50 | }
51 | 
52 | variable "scheduler_node_selector" {
53 |   type = map
54 | }
55 | variable "worker_node_selector" {
56 |   type = map
57 | }
58 | variable "app_node_selector" {
59 |   type = map
60 | }
61 | variable "scheduler_tolerations" {
62 |   type = list(object({
63 |     effect = string
64 |     key = string
65 |     operator = string
66 |     value = string
67 |   }))
68 | }
69 | variable "worker_tolerations" {
70 |   type = list(object({
71 |     effect = string
72 |     key = string
73 |     operator = string
74 |     value = string
75 |   }))
76 | }
77 | variable "app_tolerations" {
78 |   type = list(object({
79 |     effect = string
80 |     key = string
81 |     operator = string
82 |     value = string
83 |   }))
84 | }
85 | 
--------------------------------------------------------------------------------
/tf/devserver/devserver.tf:
--------------------------------------------------------------------------------
1 | resource "kubernetes_persistent_volume_claim" "devserver" {
2 |   metadata {
3 |     name = "devserver"
4 |     namespace = var.namespace
5 |   }
6 |   spec {
7 |     access_modes = [
8 |       "ReadWriteOnce",
9 |     ]
10 |     resources {
11 |       requests = {
12 |         storage = "1Gi"
13 |       }
14 |     }
15 |   }
16 | }
17 | 
18 | resource "kubernetes_deployment" "devserver" {
19 |   metadata {
20 |     name = "devserver"
21 |     namespace = var.namespace
22 |   }
23 |   spec {
24 |     selector {
25 |       match_labels = {
26 |         app = "devserver"
27 |       }
28 |     }
29 |     strategy {
30 |       type = "Recreate"
31 |     }
32 |     template {
33 |       metadata {
34 |         labels = {
35 |           app = "devserver"
36 |         }
37 |       }
38 |       spec {
39 |         container {
40 |           command = [
41 |             "/bin/bash",
42 |             "-c",
43 |             "cd ~ && sleep infinity",
44 |           ]
45 |           env {
46 |             name = "MINIO_ACCESS_KEY_ID"
47 |             value_from {
48 |               secret_key_ref {
49 |                 key = "user"
50 |                 name = var.minio_secret_name
51 |               }
52 |             }
53 |           }
54 |           env {
55 |             name = "MINIO_ACCESS_KEY_SECRET"
56 |             value_from {
57 |               secret_key_ref {
58 |                 key = "pass"
59 |                 name = var.minio_secret_name
60 |               }
61 |             }
62 |           }
63 |           image = var.image
64 |           name = "devserver"
65 |           resources {
66 |             limits = {
67 |               memory = "4096Mi"
68 |             }
69 |             requests = {
70 |               memory = "2048Mi"
71 |             }
72 |           }
73 |           volume_mount {
74 |             mount_path = "/root"
75 |             name = "devserver-persistent-storage"
76 |           }
77 |         }
78 |         node_selector = {
79 |           "kubernetes.io/arch" = "amd64"
80 |         }
81 |         volume {
82 |           name = "devserver-persistent-storage"
83 |           persistent_volume_claim {
84 |             claim_name = "devserver"
85 |           }
86 |         }
87 |       }
88 |     }
89 |   }
90 | }
91 | 
--------------------------------------------------------------------------------
/docker/metastore/Dockerfile:
--------------------------------------------------------------------------------
1 | # Loosely based on: https://techjogging.com/standalone-hive-metastore-presto-docker.html
2 | 
3 | # Use current LTS
4 | FROM ubuntu:22.04
5 | 
6 | ENV HADOOP_VERSION=3.3.1 \
7 |     METASTORE_VERSION=3.1.3 \
8 |     POSTGRES_JDBC_VERSION=42.3.2 \
9 |     AWS_SDK_JAR_VERSION=1.11.901 \
10 |     HADOOP_HOME=/opt/hadoop \
11 |     HIVE_HOME=/opt/hive-metastore \
12 |     DEBIAN_FRONTEND="noninteractive"
13 | 
14 | RUN mkdir -p $HIVE_HOME/lib \
15 |   && mkdir -p $HADOOP_HOME \
16 |   && chmod a+rw $HIVE_HOME \
17 |   && chmod a+rw $HIVE_HOME/lib \
18 |   && chmod a+rw $HADOOP_HOME \
19 |   \
20 |   && apt-get update \
21 |   && apt-get -y install gnupg2 curl openjdk-8-jre \
22 |   && apt-get clean \
23 |   && rm -rf /var/lib/apt/lists /var/cache/apt/archives \
24 |   \
25 |   && curl -L https://repo1.maven.org/maven2/org/apache/hive/hive-standalone-metastore/${METASTORE_VERSION}/hive-standalone-metastore-${METASTORE_VERSION}-bin.tar.gz | tar zxf - \
26 |   && mv apache-hive-metastore-${METASTORE_VERSION}-bin/* $HIVE_HOME \
27 |   && rmdir -v apache-hive-metastore-${METASTORE_VERSION}-bin \
28 |   \
29 |   && curl -L https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - \
30 |   && mv -v hadoop-${HADOOP_VERSION}/* $HADOOP_HOME \
31 |   && rmdir -v hadoop-${HADOOP_VERSION} \
32 |   \
33 |   && curl -O https://jdbc.postgresql.org/download/postgresql-${POSTGRES_JDBC_VERSION}.jar \
34 |   && mv -v postgresql-${POSTGRES_JDBC_VERSION}.jar $HIVE_HOME/lib/ \
35 |   \
36 |   && stat $HADOOP_HOME/share/hadoop/tools/lib/aws-java-sdk-bundle-$AWS_SDK_JAR_VERSION.jar \
37 |   && cp -v $HADOOP_HOME/share/hadoop/tools/lib/aws-java-sdk-bundle-$AWS_SDK_JAR_VERSION.jar $HIVE_HOME/lib/ \
38 |   && cp -v $HADOOP_HOME/share/hadoop/tools/lib/hadoop-aws-$HADOOP_VERSION.jar $HIVE_HOME/lib/ \
39 |   \
40 |   && rm -v $HIVE_HOME/lib/guava-*.jar \
41 |   && stat $HADOOP_HOME/share/hadoop/common/lib/guava-27.0-jre.jar \
42 |   && cp -v $HADOOP_HOME/share/hadoop/common/lib/guava-27.0-jre.jar $HIVE_HOME/lib/
43 | 
44 | ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \
45 |     HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$HIVE_HOME/lib/aws-java-sdk-bundle-$AWS_SDK_JAR_VERSION.jar:$HIVE_HOME/lib/hadoop-aws-$HADOOP_VERSION.jar:$HIVE_HOME/lib/postgresql-$POSTGRES_JDBC_VERSION.jar \
46 |     METASTORE_AUX_JARS_PATH=$HIVE_HOME/lib/aws-java-sdk-bundle-$AWS_SDK_JAR_VERSION.jar:$HIVE_HOME/lib/hadoop-aws-$HADOOP_VERSION.jar
47 | 
--------------------------------------------------------------------------------
/tf/superset/superset_config.py:
--------------------------------------------------------------------------------
1 | import os
2 | from celery.schedules import crontab
3 | from cachelib.redis import RedisCache
4 | 
5 | def env(key, default=None):
6 |     val = os.getenv(key, default)
7 |     if val is None:
8 |         raise Exception("Missing required envvar: {}".format(key))
9 |     return val
10 | 
11 | MAPBOX_API_KEY = env('MAPBOX_API_KEY', '')
12 | CACHE_CONFIG = {
13 |     'CACHE_TYPE': 'redis',
14 |     'CACHE_DEFAULT_TIMEOUT': 300,
15 |     'CACHE_KEY_PREFIX': 'superset_',
16 |     'CACHE_REDIS_HOST': env('REDIS_HOST'),
17 |     'CACHE_REDIS_PORT': env('REDIS_PORT', 6379),
18 |     'CACHE_REDIS_PASSWORD': env('REDIS_PASSWORD'),
19 |     'CACHE_REDIS_DB': env('REDIS_CACHE_DB', 1),
20 | }
21 | DATA_CACHE_CONFIG = CACHE_CONFIG
22 | EXPLORE_FORM_DATA_CACHE_CONFIG = CACHE_CONFIG
23 | FILTER_STATE_CACHE_CONFIG = CACHE_CONFIG
24 | 
25 | SQLALCHEMY_DATABASE_URI = f"postgresql+psycopg2://{env('POSTGRES_USER')}:{env('POSTGRES_PASSWORD')}@{env('POSTGRES_HOST')}:{env('POSTGRES_PORT', 5432)}/{env('POSTGRES_DB', 'superset')}"
26 | SQLALCHEMY_TRACK_MODIFICATIONS = True
27 | SECRET_KEY = env('SECRET_KEY')
28 | 
29 | # Flask-WTF flag for CSRF
30 | WTF_CSRF_ENABLED = True
31 | # Add endpoints that need to be exempt from CSRF protection
32 | WTF_CSRF_EXEMPT_LIST = []
33 | # A CSRF token that expires in 1 year
34 | WTF_CSRF_TIME_LIMIT = 60 * 60 * 24 * 365
35 | 
36 | class CeleryConfig(object):
37 |     BROKER_URL = f"redis://:{env('REDIS_PASSWORD')}@{env('REDIS_HOST')}:{env('REDIS_PORT', 6379)}/{env('REDIS_CELERY_DB', 0)}"
38 |     CELERY_IMPORTS = ('superset.sql_lab', )
39 |     CELERY_RESULT_BACKEND = f"redis://:{env('REDIS_PASSWORD')}@{env('REDIS_HOST')}:{env('REDIS_PORT', 6379)}/{env('REDIS_RESULTS_DB', 1)}"
40 |     CELERYD_LOG_LEVEL = "DEBUG"
41 |     CELERYD_PREFETCH_MULTIPLIER = 1
42 |     CELERY_ACKS_LATE = True
43 |     CELERY_ANNOTATIONS = {
44 |         'tasks.add': {
45 |             'rate_limit': '10/s'
46 |         },
47 |         'sql_lab.get_sql_results': {
48 |             'rate_limit': '100/s',
49 |         },
50 |         'email_reports.send': {
51 |             'rate_limit': '1/s',
52 |             'time_limit': 600,
53 |             'soft_time_limit': 600,
54 |             'ignore_result': True,
55 |         }
56 |     }
57 |     CELERYBEAT_SCHEDULE = {
58 |         'reports.scheduler': {
59 |             'task': 'reports.scheduler',
60 |             'schedule': crontab(minute='*', hour='*'),
61 |         },
62 |         'reports.prune_log': {
63 |             'task': 'reports.prune_log',
64 |             'schedule': crontab(minute=0, hour=0),
65 |         }
66 |     }
67 | CELERY_CONFIG = CeleryConfig
68 | 
69 | RESULTS_BACKEND = RedisCache(
70 |     host=env('REDIS_HOST'),
71 |     port=env('REDIS_PORT', 6379),
72 |     password=env('REDIS_PASSWORD'),
73 |     db=env('REDIS_CELERY_DB', 0),
74 |     key_prefix='superset_results'
75 | )
76 | 
77 | ####
78 | # User superset config begins here
79 | ####
80 | 
--------------------------------------------------------------------------------
/tf/redis/redis.tf:
--------------------------------------------------------------------------------
1 | resource "random_password" "pass" {
2 |   length = 32
3 |   special = false
4 | }
5 | 
6 | resource "kubernetes_secret" "redis" {
7 |   metadata {
8 |     labels = {
9 |       app = var.name
10 |     }
11 |     name = var.name
12 |     namespace = var.namespace
13 |   }
14 |   type = "Opaque"
15 |   data = {
16 |     pass = random_password.pass.result
17 |   }
18 | }
19 | 
20 | resource "kubernetes_service" "redis" {
21 |   metadata {
22 |     labels = {
23 |       app = var.name
24 |     }
25 |     name = var.name
26 |     namespace = var.namespace
27 |   }
28 |   spec {
29 |     port {
30 |       name = "redis"
31 |       port = 6379
32 |       target_port = "redis"
33 |     }
34 |     selector = {
35 |       app = var.name
36 |     }
37 |     type = "ClusterIP"
38 |   }
39 | }
40 | 
41 | resource "kubernetes_stateful_set" "redis" {
42 |   metadata {
43 |     labels = {
44 |       app = var.name
45 |     }
46 |     name = var.name
47 |     namespace = var.namespace
48 |   }
49 |   spec {
50 |     replicas = 1
51 |     selector {
52 |       match_labels = {
53 |         app = var.name
54 |       }
55 |     }
56 |     service_name = var.name
57 |     template {
58 |       metadata {
59 |         labels = {
60 |           app = var.name
61 |         }
62 |       }
63 |       spec {
64 |         container {
65 |           command = [
66 |             "/bin/sh",
67 |             "-c",
68 |             "redis-server --bind 0.0.0.0 --requirepass $REDIS_PASSWORD --loglevel $LOG_LEVEL --dir /data --maxmemory ${var.max_memory} --maxmemory-policy allkeys-lru --lazyfree-lazy-eviction yes --lazyfree-lazy-expire yes --io-threads 3",
69 |           ]
70 |           env {
71 |             name = "LOG_LEVEL"
72 |             value = "notice"
73 |           }
74 |           env {
75 |             name = "REDIS_PASSWORD"
76 |             value_from {
77 | secret_key_ref { 78 | key = "pass" 79 | name = kubernetes_secret.redis.metadata[0].name 80 | } 81 | } 82 | } 83 | image = var.image 84 | name = "server" 85 | port { 86 | container_port = 6379 87 | name = "redis" 88 | } 89 | startup_probe { 90 | initial_delay_seconds = 5 91 | period_seconds = 10 92 | tcp_socket { 93 | port = "redis" 94 | } 95 | } 96 | volume_mount { 97 | mount_path = "/data" 98 | name = "storage" 99 | } 100 | } 101 | } 102 | } 103 | volume_claim_template { 104 | metadata { 105 | name = "storage" 106 | } 107 | spec { 108 | access_modes = [ 109 | "ReadWriteOnce", 110 | ] 111 | resources { 112 | requests = { 113 | storage = var.storage 114 | } 115 | } 116 | } 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /tf/main.tf: -------------------------------------------------------------------------------- 1 | provider "kubernetes" { 2 | config_path = "~/.kube/config" 3 | } 4 | 5 | provider "helm" { 6 | kubernetes { 7 | config_path = "~/.kube/config" 8 | } 9 | } 10 | 11 | terraform { 12 | required_providers { 13 | helm = { 14 | source = "hashicorp/helm" 15 | version = "2.17.0" 16 | } 17 | kubernetes = { 18 | source = "hashicorp/kubernetes" 19 | version = "2.35.1" 20 | } 21 | random = { 22 | source = "hashicorp/random" 23 | version = "3.6.3" 24 | } 25 | } 26 | } 27 | 28 | module "namespace" { 29 | source = "./namespace" 30 | 31 | name = var.namespace 32 | } 33 | 34 | module "nessie_postgres" { 35 | count = var.nessie_postgres_internal ? 
1 : 0 36 | source = "./postgres" 37 | 38 | namespace = module.namespace.name 39 | name = "nessie-postgres" 40 | image = var.image_postgres 41 | 42 | db = "nessie" 43 | storage = "1Gi" 44 | } 45 | 46 | resource "helm_release" "nessie" { 47 | name = "nessie" 48 | namespace = module.namespace.name 49 | repository = "https://charts.projectnessie.org/" 50 | chart = "nessie" 51 | # latest from https://projectnessie.org/nessie-latest/ 52 | version = "0.102.2" 53 | 54 | values = [ 55 | templatefile("${path.module}/nessie-values.yaml", { 56 | jdbc_url = var.nessie_postgres_internal ? "jdbc:postgresql://nessie-postgres:5432/nessie" : var.nessie_postgres_url 57 | jdbc_secret_name = var.nessie_postgres_internal ? "nessie-postgres" : "" 58 | }) 59 | ] 60 | } 61 | 62 | module "minio" { 63 | source = "./minio" 64 | 65 | namespace = module.namespace.name 66 | name = "minio" 67 | image = var.image_minio 68 | 69 | username = var.minio_username 70 | password = var.minio_password 71 | replicas = var.minio_replicas 72 | mem_limit = var.minio_mem_limit 73 | 74 | node_selector = var.minio_node_selector 75 | tolerations = var.minio_tolerations 76 | 77 | storage = "1Gi" 78 | nfs_server = "" 79 | nfs_path = "" 80 | } 81 | 82 | resource "helm_release" "trino" { 83 | name = "trino" 84 | namespace = module.namespace.name 85 | repository = "https://trinodb.github.io/charts" 86 | chart = "trino" 87 | # latest from https://github.com/trinodb/charts/ 88 | version = "1.36.0" 89 | 90 | values = [ 91 | templatefile("${path.module}/trino-values.yaml", { 92 | catalogs = yamlencode(merge({ 93 | iceberg = < CREATE SCHEMA IF NOT EXISTS direct WITH (location='s3a://direct/'); 105 | CREATE SCHEMA 106 | ``` 107 | 108 | If you run `SHOW SCHEMAS` you should see: 109 | ``` 110 | trino:direct> SHOW SCHEMAS; 111 | Schema 112 | -------------------- 113 | default 114 | direct 115 | information_schema 116 | (3 rows) 117 | ``` 118 | 119 | Now we can create a table and store some data: 120 | ``` 121 | trino:direct> 
CREATE TABLE IF NOT EXISTS dim_foo(key VARCHAR, val BIGINT); 122 | CREATE TABLE 123 | 124 | trino:direct> INSERT INTO dim_foo VALUES ('this', 1), ('is', 2), ('a', 3), ('test', 4); 125 | INSERT: 4 rows 126 | ``` 127 | 128 | Assuming everything is working, you should be able to query the stored values: 129 | ``` 130 | trino:direct> SELECT key, val FROM dim_foo; 131 | key | val 132 | ------+----- 133 | this | 1 134 | is | 2 135 | a | 3 136 | test | 4 137 | (4 rows) 138 | ``` 139 | 140 | Now we can press `Ctrl+D` to exit the Trino console session, and look at the files created in the underlying MinIO bucket we created earlier: 141 | ``` 142 | trino:direct> ^D 143 | 144 | devserver# mc ls -r walden-minio/direct 145 | [2022-03-11 06:22:24 UTC] 0B STANDARD dim_foo/ 146 | [2022-03-11 06:21:42 UTC] 356B STANDARD 20220311_062141_00005_26e8n_9d96d247-6da3-49f9-a537-b0bc897879b9 147 | ``` 148 | 149 | We can clean up our test data by deleting the table and then the schema: 150 | ``` 151 | devserver# trino direct 152 | 153 | trino:direct> DROP TABLE dim_foo; 154 | DROP TABLE 155 | trino:direct> DROP SCHEMA direct; 156 | DROP SCHEMA 157 | trino:direct> ^D 158 | 159 | devserver# mc ls walden-minio/direct 160 | 161 | ``` 162 | 163 | ### Explore data with Superset 164 | 165 | Superset provides a convenient UI for exploring the data that you've stored in Trino. 166 | Walden includes an instance of Superset that's preconfigured to connect to Trino. 167 | 168 | Superset has been configured with a `walden` user and a randomly generated password.
169 | 170 | Get the password for logging into Superset: 171 | ``` 172 | $ kubectl get secret -n walden superset-admin -o 'jsonpath={.data.pass}' | base64 -d && echo 173 | lONGpASSWoRD64HERE 174 | ``` 175 | 176 | Set up a port-forward to access Superset on port `8088`: 177 | ``` 178 | $ kubectl port-forward -n walden deployment/superset 8088 179 | ``` 180 | 181 | Go to [http://127.0.0.1:8088/](http://127.0.0.1:8088/) and log in with user=`walden` and the password you got earlier. 182 | 183 | Two Trino databases should have been automatically added to Superset by Walden: 184 | - `walden-trino` has Trino data, including what we added to a `test` schema in earlier steps. The data itself is stored in MinIO in Hive columnar table format, with the Metastore acting as the index. 185 | - `walden-trino-system` has various Trino system statistics, like node connectivity, that may also be interesting to explore. 186 | In addition to these autoconfigured databases, you can add other external databases to Superset via `Data` > `Databases` in the top menu. 187 | 188 | We can start exploring by going to `SQL Lab` > `SQL Editor` in the top menu. 189 | In the SQL Editor view, select the `walden-trino` database, then the `test` schema that we created earlier. 190 | The values we added to this schema earlier should also be visible via Superset. 191 | 192 | ![Screenshot of Superset UI showing test data](superset.png) 193 | 194 | ### View the Trino Dashboard 195 | 196 | Trino also provides a dashboard UI showing basic activity. Set up a port-forward to access the dashboard on port `8080`: 197 | ``` 198 | $ kubectl port-forward -n walden deployment/trino-coordinator 8080:80 199 | ``` 200 | 201 | Go to [http://127.0.0.1:8080](http://127.0.0.1:8080) and log in with any value for the username (it doesn't matter). 202 | 203 | ![Screenshot of Trino Dashboard](trino.png) 204 | 205 | ## Conclusions 206 | 207 | That's it: this is an easy way to get a small data lake working.
This is meant to be a fully functional starting point that can be expanded and customized to fit your needs. 209 | Everything here is provided as-is, so your mileage may vary. 210 | Please report any bugs or issues and we will try to get to them. 211 | 212 | ## Cloud Provider Installation 213 | 214 | Walden can be used either on-premise or in hosted Kubernetes environments. 215 | Here are some example steps for setting up Walden in various cloud providers. 216 | 217 | ### AWS 218 | 219 | This tutorial assumes you already have an AWS account set up. Instructions should be run from a Mac or Linux machine. Also keep in mind that trying this out is likely to incur a few dollars in AWS costs. We've been able to keep costs below $5 USD when running a minimal cluster for a short amount of time. 220 | 221 | #### Configure AWS EKS Admin 222 | 223 | To manage the EKS cluster programmatically from your local machine, you will need to create a new AWS IAM user and grant it appropriate permissions. 224 | 225 | The easiest way to do so reproducibly involves using a "CloudShell" session. To bring one up, search for "CloudShell" in your AWS console. Note that you should be logged in with your AWS root account when running these operations. 226 | 227 | 1. Create IAM user 228 | 229 | Run the following in your AWS cloud shell session: 230 | ``` 231 | aws iam create-user --user-name eksadmin 232 | ``` 233 | **Note**: you can skip this step if you already have an IAM user you would like to use. Simply replace `eksadmin` with your user name where necessary. 234 | 235 | 2. Create IAM policies 236 | 237 | To be able to spin up an EKS cluster via `eksctl`, we will need to define two new policies, with which we will associate our EKS admin user. The policy definitions are available in policy documents stored in this repository.
To access them, clone the repository first (from your AWS cloud shell session): 238 | 239 | ``` 240 | git clone https://github.com/scie-nz/walden.git 241 | cd walden 242 | ``` 243 | 244 | We will preserve the account identifier for future use: 245 | 246 | ``` 247 | export AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text) 248 | echo $AWS_ACCOUNT_ID 249 | ``` 250 | 251 | We can now render the policies by substituting the `AWS_ACCOUNT_ID` we just captured: 252 | ``` 253 | cat aws/eks-all-access.json | envsubst > eks-all-access-rendered.json 254 | cat aws/iam-limited-access.json | envsubst > iam-limited-access-rendered.json 255 | ``` 256 | 257 | Then we will create our policies: 258 | ``` 259 | aws iam create-policy --policy-name EKSAllAccess --policy-document file://eks-all-access-rendered.json 260 | aws iam create-policy --policy-name IAMLimitedAccess --policy-document file://iam-limited-access-rendered.json 261 | ``` 262 | 263 | 3. Create IAM EKS Admin group 264 | 265 | First we create our group and add our EKS admin user to it: 266 | 267 | ``` 268 | aws iam create-group --group-name EKSAdmins 269 | aws iam add-user-to-group --group-name EKSAdmins --user-name eksadmin 270 | ``` 271 | 272 | We are then ready to attach permissions to the group: 273 | 274 | ``` 275 | aws iam attach-group-policy --group-name EKSAdmins --policy-arn arn:aws:iam::$AWS_ACCOUNT_ID:policy/EKSAllAccess 276 | aws iam attach-group-policy --group-name EKSAdmins --policy-arn arn:aws:iam::$AWS_ACCOUNT_ID:policy/IAMLimitedAccess 277 | aws iam attach-group-policy --group-name EKSAdmins --policy-arn arn:aws:iam::aws:policy/AmazonEC2FullAccess 278 | aws iam attach-group-policy --group-name EKSAdmins --policy-arn arn:aws:iam::aws:policy/AWSCloudFormationFullAccess 279 | ``` 280 | 281 | You are now ready to create and manage EKS clusters using your `eksadmin` user. The rest of the interactions will occur via your regular machine, rather than a cloud shell instance.
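The customer-managed policy ARNs used in the attach commands above are assembled from the account ID captured earlier; as a minimal sketch of how that ARN is put together (the account ID below is a placeholder, not a real account):

```shell
# Placeholder account ID -- in practice, use the value exported earlier
# via `aws sts get-caller-identity`
AWS_ACCOUNT_ID=123456789012
# Customer-managed policies embed your account ID in the ARN
echo "arn:aws:iam::${AWS_ACCOUNT_ID}:policy/EKSAllAccess"
echo "arn:aws:iam::${AWS_ACCOUNT_ID}:policy/IAMLimitedAccess"
```

AWS-managed policies such as `AmazonEC2FullAccess` use the literal `aws` account segment instead, which is why those two ARNs above don't embed your account ID.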
282 | 283 | #### Deploy EKS cluster 284 | 285 | The following operations are meant to be run on your local machine (Mac or Linux only). 286 | 287 | To provision this cluster you will need to have the following software installed: 288 | - the [AWS CLI](https://aws.amazon.com/cli/) 289 | - [eksctl](https://docs.aws.amazon.com/eks/latest/userguide/eksctl.html) 290 | - [kubectl](https://kubernetes.io/docs/tasks/tools/) 291 | 292 | 1. Set up your AWS CLI authentication 293 | 294 | Edit `~/.aws/credentials` and add your AWS access key ID and secret (your access key ID starts with `AKIA`): 295 | 296 | ``` 297 | [walden] 298 | aws_access_key_id=[YOUR ACCESS KEY ID GOES HERE] 299 | aws_secret_access_key=[YOUR ACCESS SECRET GOES HERE] 300 | ``` 301 | 302 | Then run: 303 | ``` 304 | export AWS_PROFILE=walden 305 | aws sts get-caller-identity 306 | ``` 307 | 308 | You should see something like: 309 | ``` 310 | { 311 | "UserId": "AIDA**********", 312 | "Account": "1234567890", 313 | "Arn": "arn:aws:iam::1234567890:user/[someusername]" 314 | } 315 | ``` 316 | 317 | 2. Create minimal EKS cluster using `eksctl` 318 | 319 | To create a minimal cluster, run: 320 | ``` 321 | eksctl create cluster --name=eks-walden --nodes=4 --node-type=r5.large --spot 322 | ``` 323 | 324 | This command will create an EKS cluster with one default node group of 4 nodes. This is purely a test cluster -- `eksctl` is a very powerful tool that allows you to customize your cluster however you see fit. 325 | 326 | The command will take about 30 minutes to run while AWS provisions the requisite resources. 327 | 328 | Once the cluster creation has succeeded, run: 329 | ``` 330 | kubectl get nodes 331 | ``` 332 | 333 | #### Deploy Walden 334 | 335 | You now have a working EKS cluster, on which you can deploy Walden just as you would on an on-premise cluster. Follow [these instructions](https://github.com/scie-nz/walden#deploy-walden) to deploy it.
336 | 337 | #### Clean up 338 | 339 | First, get the name of your nodegroup: 340 | ``` 341 | eksctl get nodegroup --cluster eks-walden 342 | ``` 343 | 344 | Then, delete the nodegroup: 345 | ``` 346 | eksctl delete nodegroup [NODEGROUP NAME GOES HERE] --cluster eks-walden 347 | ``` 348 | 349 | You can now delete your cluster: 350 | ``` 351 | eksctl delete cluster eks-walden 352 | ``` 353 | 354 | Finally, you should clean up your EBS volumes. You can do so by visiting the [Volumes](https://us-west-2.console.aws.amazon.com/ec2/v2/home?#Volumes) section in your AWS console. 355 | 356 | NOTE: please take care when cleaning up your EBS volumes. You may lose data you care about. Make sure you understand which volumes you're deleting. 357 | 358 | ### Azure 359 | 360 | This tutorial assumes you have a working Azure account with default quota settings. You will likely need to activate pay-as-you-go billing to be able to provision the AKS cluster described here. 361 | 362 | #### Create AKS cluster 363 | 364 | First, create a dedicated resource group; this example uses the `centralus` region: 365 | ``` 366 | az group create --name WaldenResourceGroup -l centralus 367 | ``` 368 | 369 | You are now ready to create your cluster: 370 | ``` 371 | az aks create -g WaldenResourceGroup -n WaldenAKS --node-count 5 --node-vm-size Standard_B2ms 372 | ``` 373 | 374 | To connect to the cluster: 375 | ``` 376 | az aks get-credentials --resource-group WaldenResourceGroup --name WaldenAKS 377 | ``` 378 | 379 | #### Deploy Walden 380 | 381 | You now have a working AKS cluster, on which you can deploy Walden just as you would on an on-premise cluster. Follow [these instructions](https://github.com/scie-nz/walden#deploy-walden) to deploy it.
382 | 383 | #### Clean up 384 | 385 | First, delete the cluster: 386 | ``` 387 | az aks delete --resource-group WaldenResourceGroup --name WaldenAKS 388 | ``` 389 | 390 | You can now delete the resource group: 391 | ``` 392 | az group delete --resource-group WaldenResourceGroup 393 | ``` 394 | 395 | ## Advanced topics 396 | 397 | ### Adding external data sources via Trino 398 | 399 | External databases can be added to Walden by [connecting them to Trino](https://trino.io/docs/current/connector.html) as a separate "Catalog". The new Trino Catalog can then be [added to Superset](https://superset.apache.org/docs/connecting-to-databases/installing-database-drivers). 400 | 401 | This strategy allows using both Trino and Superset to interact with the external data. However, some data types (such as GIS geometry columns) may not work well with Trino. In those cases you can instead connect Superset to the external database directly, as described in the next section. 402 | 403 | 1. Create a separate Kubernetes `ConfigMap` named `trino-catalog-extra` in the `walden` namespace. This `ConfigMap` should contain one or more `.properties` files for each [Trino connector](https://trino.io/docs/current/connector.html) that you want. 404 | 2. If Walden is already deployed, restart the `trino-*` pods manually for the change to take effect. 405 | ``` 406 | $ kubectl delete pod -n walden trino-coordinator-xxxx-yyyy trino-worker-xxxx-yyyy 407 | ``` 408 | 3. Verify that the external data source is accessible by logging in to the `devserver` pod as described above and checking for a data source named `custom`. 409 | ``` 410 | $ kubectl exec -it -n walden deployment/devserver -- /bin/bash 411 | # trino-cli --server trino --catalog custom 412 | trino> SHOW SCHEMAS; 413 | trino> DESCRIBE <schema>.<table>; 414 | ``` 415 | 416 | Now we should be able to add the new Trino catalog to Superset: 417 | 418 | 1. Open the Superset UI and log in as described above.
419 | ``` 420 | $ kubectl get secret -n walden superset-admin -o 'jsonpath={.data.pass}' | base64 -d && echo 421 | lONGpASSWoRD64HERE 422 | $ kubectl port-forward -n walden deployment/superset 8088 423 | ``` 424 | 2. Go to `Data` > `Databases` via the top menu and click the `+ Database` button in the upper right to add a new Database. 425 | 3. Select the `Trino` database type from the pull-down menu. 426 | 4. Set the `SQLAlchemy URI` to `trino://trino/custom`. 427 | 5. (OPTIONAL) Switch to the `Advanced` tab and enable the following: 428 | - SQL Lab: 429 | - `Expose database in SQL Lab`, followed by... 430 | - `Allow Multi Schema Metadata Fetch` (optional: don't enable if the DB is very large) 431 | - `Enable query cost estimation` 432 | - `Allow this database to be explored` 433 | - Performance: 434 | - `Asynchronous query execution` 435 | 6. Click `Connect` to create the new Database entry. 436 | 437 | The new Database entry can be reconfigured later if needed. 438 | 439 | Now you can switch to `SQL Lab` > `SQL Editor` and preview the new Database, confirming that it looks as expected. 440 | 441 | Check the [Trino](https://trino.io/docs/current/connector.html) and [Superset](https://superset.apache.org/docs/connecting-to-databases/installing-database-drivers) docs for any additional information on configuring particular database types. 442 | 443 | ![Screenshot of Superset UI showing external PostGIS data via Trino](superset-external.png) 444 | 445 | ### Adding external data sources (and other CLI configuration) to Superset 446 | 447 | Instead of connecting an external database via Trino, it can be connected to Superset directly. This means the data will only be accessible via the Superset UI, and will not be accessible via Trino. 448 | 449 | Follow the above steps for logging into the Superset UI and adding a new Database entry, except this time you should pick the type of database that you are adding, instead of Trino.
The steps are otherwise similar. If your database type isn't listed, you may need to build a custom `walden-superset` Docker image that installs the required Python module(s). 450 | 451 | Check the [Superset docs](https://superset.apache.org/docs/connecting-to-databases/installing-database-drivers) for any additional information on configuring particular database types. 452 | 453 | If you wish to provide the additional datasources declaratively via a YAML file, you can do so with something like the following custom `ConfigMap`. The special `superset_init_custom.sh` script allows running your own custom CLI commands on Superset startup. The `superset-custom` `ConfigMap` will take effect after restarting the `superset` and `superset-worker` pods: 454 | 455 | ``` 456 | apiVersion: v1 457 | kind: ConfigMap 458 | metadata: 459 | name: superset-custom 460 | namespace: walden 461 | data: 462 | superset_init_custom.sh: | 463 | superset import_datasources -p /app/pythonpath/superset_datasources_custom.yaml 464 | superset_datasources_custom.yaml: | 465 | databases: 466 | - name: my_database 467 | ...config here... 468 | ``` 469 | 470 | ### Adding/overriding superset_config.py configuration 471 | 472 | The provided `superset_config.py` offers a reasonable base configuration for integrating with Walden, using Postgres as the metadata database and Redis as the cache. 473 | 474 | You may want to customize this configuration, for example to configure a custom authentication provider. To do this, create your own `ConfigMap` named `superset-custom` which contains your own `superset_config.py`, and/or any other files that should be included in the same directory as `superset_config.py`. The content of your custom `superset_config.py` will be concatenated to the end of the [default Walden `superset_config.py`](kube/configs/superset_config.py), and any additional files you provide will be copied into the same directory.
475 | 476 | Here is a minimal example of configuring custom additions to `superset_config.py`, which will take effect after restarting the `superset` and `superset-worker` pods. This can be combined with the above example of running custom superset CLI commands on pod startup: 477 | 478 | ``` 479 | apiVersion: v1 480 | kind: ConfigMap 481 | metadata: 482 | name: superset-custom 483 | namespace: walden 484 | data: 485 | superset_config.py: | 486 | print("hello world! this is a custom config") 487 | custom_sso_security_manager.py: | 488 | # for example, this could have your custom SupersetSecurityManager implementation 489 | # see docs: https://superset.apache.org/docs/installation/configuring-superset/#custom-oauth2-configuration 490 | ``` 491 | 492 | Similarly, if you need to provide some credentials for your config, they can be specified in a separate `Secret` that's also named `superset-custom`: 493 | 494 | ``` 495 | apiVersion: v1 496 | kind: Secret 497 | metadata: 498 | name: superset-custom 499 | namespace: walden 500 | stringData: 501 | oauth_secrets.json: | 502 | { ... secret keys here ... } 503 | ``` 504 | 505 | ### Building images using Kaniko 506 | 507 | Cheat sheet for building images from within an existing cluster. 508 | This can also be done locally via the Docker CLI or similar. 509 | ``` 510 | kubectl create secret -n walden docker-registry regcred --docker-server=https://index.docker.io/v1/ --docker-username=[your-docker-username] --docker-password=[your-docker-password] 511 | kubectl apply -f kube-build/templates/kaniko-devserver.yaml 512 | ``` 513 | 514 | After building/pushing new release images, update the tags for the affected `image_*` defaults in [`tf/variables.tf`](tf/variables.tf). 515 | 516 | ### Deploying with custom images 517 | 518 | Walden can be deployed with custom images from your registry/organization. 519 | 520 | 1. 
Create your own `terraform.tfvars` file with custom overrides for the `image_*` values listed under [`tf/variables.tf`](tf/variables.tf). 521 | 2. Build and push images: Run `docker/*/build.sh` and `docker/*/push.sh` 522 | 3. Deploy using custom images: Run `terraform apply` under the `tf/` directory 523 | 524 | ### Deploying more MinIO nodes 525 | 526 | MinIO must be deployed with at least four nodes, which is the default number used by Walden. 527 | If you'd like to deploy more MinIO nodes, create a `terraform.tfvars` file with a custom override of the `minio_replicas` setting, then apply using `terraform apply` under the `tf/` directory. 528 | 529 | ### Deploying MinIO on alternate architectures 530 | 531 | The MinIO images are multi-arch and so can be configured to run on nodes with non-`amd64` architectures. 532 | In our case, we have a mixed-architecture cluster where several `arm64` Raspberry Pis provide local storage, making them a convenient place for running the MinIO pods. 533 | To deploy with MinIO nodes on a different architecture, edit the `minio_arch` setting in your `terraform.tfvars` file. 534 | Note that we do not support custom architectures for the `walden-*` images themselves, as the underlying software doesn't handle them well. 535 | --------------------------------------------------------------------------------
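As a concrete illustration of the `terraform.tfvars` overrides described in the sections above, here is a minimal sketch; the image tag and replica count are placeholder values, and the variable names assume the `image_minio`, `minio_replicas`, and `minio_arch` variables referenced in `tf/variables.tf` and `tf/main.tf`:

```
# terraform.tfvars -- placeholder values, adjust for your registry and cluster
image_minio    = "myorg/minio:custom-tag"
minio_replicas = 6
minio_arch     = "arm64"
```

After editing the file, run `terraform apply` under the `tf/` directory for the overrides to take effect.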