├── trino.png
├── kube-build
│ ├── Chart.yaml
│ ├── templates
│ │ ├── kaniko-trino.yaml
│ │ └── kaniko-devserver.yaml
│ └── deploy.sh
├── tf
│ ├── namespace
│ │ ├── outputs.tf
│ │ ├── variables.tf
│ │ └── namespace.tf
│ ├── outputs.tf
│ ├── redis
│ │ ├── outputs.tf
│ │ ├── variables.tf
│ │ └── redis.tf
│ ├── devserver
│ │ ├── variables.tf
│ │ └── devserver.tf
│ ├── superset
│ │ ├── outputs.tf
│ │ ├── superset_init.sh
│ │ ├── variables.tf
│ │ ├── superset_config.py
│ │ └── superset.tf
│ ├── postgres
│ │ ├── outputs.tf
│ │ ├── variables.tf
│ │ └── postgres.tf
│ ├── minio
│ │ ├── outputs.tf
│ │ ├── variables.tf
│ │ └── minio.tf
│ ├── nessie-values.yaml
│ ├── trino-values.yaml
│ ├── test-hive.sh
│ ├── main.tf
│ └── variables.tf
├── components.png
├── superset.png
├── superset-external.png
├── docker
│ ├── superset
│ │ ├── push.sh
│ │ ├── build.sh
│ │ └── Dockerfile
│ ├── devserver
│ │ ├── push.sh
│ │ ├── build.sh
│ │ └── Dockerfile
│ └── metastore
│   ├── push.sh
│   ├── build.sh
│   └── Dockerfile
├── .gitignore
├── LICENSE
├── components.drawio
└── README.md
/trino.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scie-nz/walden/HEAD/trino.png
--------------------------------------------------------------------------------
/kube-build/Chart.yaml:
--------------------------------------------------------------------------------
1 | name: walden
2 | version: 1 # required by helm
3 |
--------------------------------------------------------------------------------
/tf/namespace/outputs.tf:
--------------------------------------------------------------------------------
1 | output "name" {
2 | value = var.name
3 | }
4 |
--------------------------------------------------------------------------------
/tf/namespace/variables.tf:
--------------------------------------------------------------------------------
1 | variable "name" {
2 | type = string
3 | }
4 |
--------------------------------------------------------------------------------
/components.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scie-nz/walden/HEAD/components.png
--------------------------------------------------------------------------------
/superset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scie-nz/walden/HEAD/superset.png
--------------------------------------------------------------------------------
/tf/outputs.tf:
--------------------------------------------------------------------------------
1 | output "namespace" {
2 | value = module.namespace.name
3 | }
4 |
--------------------------------------------------------------------------------
/superset-external.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scie-nz/walden/HEAD/superset-external.png
--------------------------------------------------------------------------------
/tf/namespace/namespace.tf:
--------------------------------------------------------------------------------
1 | resource "kubernetes_namespace" "namespace" {
2 | metadata {
3 | name = var.name
4 | }
5 | }
6 |
--------------------------------------------------------------------------------
/tf/redis/outputs.tf:
--------------------------------------------------------------------------------
1 | output "pass" {
2 | value = random_password.pass.result
3 | sensitive = true
4 | }
5 | output "secret_name" {
6 | value = kubernetes_secret.redis.metadata[0].name
7 | }
8 |
--------------------------------------------------------------------------------
/tf/devserver/variables.tf:
--------------------------------------------------------------------------------
1 | variable "namespace" {
2 | type = string
3 | }
4 |
5 | variable "image" {
6 | type = string
7 | }
8 |
9 | variable "minio_secret_name" {
10 | type = string
11 | }
12 |
--------------------------------------------------------------------------------
/tf/superset/outputs.tf:
--------------------------------------------------------------------------------
1 | output "user" {
2 | value = var.username
3 | sensitive = true
4 | }
5 | output "pass" {
6 | value = var.password == "" ? random_password.admin_pass[0].result : var.password
7 | sensitive = true
8 | }
9 |
--------------------------------------------------------------------------------
/docker/superset/push.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Allow custom tag via either SUPERSET_TAG or TAG, default is current date
4 | TAG=${SUPERSET_TAG:=superset-${TAG:=$(date +%Y.%m.%d)}}
5 |
6 | # Allow custom registry/org via ORG
7 | docker push ${ORG:=ghcr.io/scie-nz}/walden:$TAG
8 |
--------------------------------------------------------------------------------
/docker/devserver/push.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Allow custom tag via either DEVSERVER_TAG or TAG, default is current date
4 | TAG=${DEVSERVER_TAG:=devserver-${TAG:=$(date +%Y.%m.%d)}}
5 |
6 | # Allow custom registry/org via ORG
7 | docker push ${ORG:=ghcr.io/scie-nz}/walden:$TAG
8 |
--------------------------------------------------------------------------------
/docker/metastore/push.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Allow custom tag via either METASTORE_TAG or TAG, default is current date
4 | TAG=${METASTORE_TAG:=metastore-${TAG:=$(date +%Y.%m.%d)}}
5 |
6 | # Allow custom registry/org via ORG
7 | docker push ${ORG:=ghcr.io/scie-nz}/walden:$TAG
8 |
--------------------------------------------------------------------------------
/docker/superset/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Allow custom tag via either SUPERSET_TAG or TAG, default is current date
4 | TAG=${SUPERSET_TAG:=superset-${TAG:=$(date +%Y.%m.%d)}}
5 |
6 | # Allow custom registry/org via ORG
7 | docker build . -t ${ORG:=ghcr.io/scie-nz}/walden:$TAG
8 |
--------------------------------------------------------------------------------
/docker/devserver/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Allow custom tag via either DEVSERVER_TAG or TAG, default is current date
4 | TAG=${DEVSERVER_TAG:=devserver-${TAG:=$(date +%Y.%m.%d)}}
5 |
6 | # Allow custom registry/org via ORG
7 | docker build . -t ${ORG:=ghcr.io/scie-nz}/walden:$TAG
8 |
--------------------------------------------------------------------------------
/docker/metastore/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Allow custom tag via either METASTORE_TAG or TAG, default is current date
4 | TAG=${METASTORE_TAG:=metastore-${TAG:=$(date +%Y.%m.%d)}}
5 |
6 | # Allow custom registry/org via ORG
7 | docker build . -t ${ORG:=ghcr.io/scie-nz}/walden:$TAG
8 |
--------------------------------------------------------------------------------
/tf/postgres/outputs.tf:
--------------------------------------------------------------------------------
1 | output "user" {
2 | value = random_password.user.result
3 | sensitive = true
4 | }
5 | output "pass" {
6 | value = random_password.pass.result
7 | sensitive = true
8 | }
9 | output "secret_name" {
10 | value = kubernetes_secret.postgres.metadata[0].name
11 | }
12 |
--------------------------------------------------------------------------------
/tf/postgres/variables.tf:
--------------------------------------------------------------------------------
1 | variable "namespace" {
2 | type = string
3 | }
4 |
5 | variable "name" {
6 | type = string
7 | }
8 |
9 | variable "image" {
10 | type = string
11 | }
12 |
13 | variable "db" {
14 | type = string
15 | }
16 |
17 | variable "storage" {
18 | type = string
19 | }
20 |
--------------------------------------------------------------------------------
/tf/redis/variables.tf:
--------------------------------------------------------------------------------
1 | variable "namespace" {
2 | type = string
3 | }
4 |
5 | variable "name" {
6 | type = string
7 | }
8 |
9 | variable "image" {
10 | type = string
11 | }
12 |
13 | variable "max_memory" {
14 | type = string
15 | }
16 | variable "storage" {
17 | type = string
18 | }
19 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore any files named "values*.yaml", except for "kube/values-default.yaml"
2 | **/values*.yaml
3 | !kube/values-default.yaml
4 |
5 | *.iml
6 | *.iws
7 | .idea/
8 |
9 | .terraform/
10 | .terraform.lock.hcl
11 | terraform.tfstate
12 | terraform.tfstate.backup
13 | .terraform.tfstate.lock.info
14 |
--------------------------------------------------------------------------------
/tf/minio/outputs.tf:
--------------------------------------------------------------------------------
1 | output "user" {
2 | value = var.username
3 | sensitive = true
4 | }
5 | output "pass" {
6 | value = var.password == "" ? random_password.admin_pass[0].result : var.password
7 | sensitive = true
8 | }
9 | output "secret_name" {
10 | value = kubernetes_secret.minio.metadata[0].name
11 | }
12 |
--------------------------------------------------------------------------------
/tf/nessie-values.yaml:
--------------------------------------------------------------------------------
1 | versionStoreType: JDBC2
2 | jdbc:
3 | jdbcUrl: "${jdbc_url}"
4 | secret:
5 | name: "${jdbc_secret_name}"
6 | username: user
7 | password: pass
8 | catalog:
9 | iceberg:
10 | defaultWarehouse: warehouse
11 | warehouses:
12 | - name: warehouse
13 | location: "s3://demobucket/"
14 | storage:
15 | s3:
16 | defaultOptions:
17 | endpoint: "http://minio:9000/"
18 | pathStyleAccess: true
19 | accessKeySecret:
20 | name: minio
21 | awsAccessKeyId: user
22 | awsSecretAccessKey: pass
23 |
--------------------------------------------------------------------------------
/tf/trino-values.yaml:
--------------------------------------------------------------------------------
1 | server:
2 | workers: ${worker_replicas}
3 | coordinator:
4 | nodeSelector: ${coordinator_node_selector}
5 | config:
6 | query:
7 | maxMemoryPerNode: ${coordinator_query_mem_limit}
8 | resources:
9 | limits:
10 | memory: ${coordinator_mem_limit}
11 | requests:
12 | memory: ${coordinator_mem_limit}
13 | jvm:
14 | maxHeapSize: ${coordinator_max_heap}
15 | worker:
16 | nodeSelector: ${worker_node_selector}
17 | config:
18 | query:
19 | maxMemoryPerNode: ${worker_query_mem_limit}
20 | resources:
21 | limits:
22 | memory: ${worker_mem_limit}
23 | requests:
24 | memory: ${worker_mem_limit}
25 | jvm:
26 | maxHeapSize: ${worker_max_heap}
27 |
--------------------------------------------------------------------------------
/tf/superset/superset_init.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -eu
3 |
4 | echo "Upgrading DB schema..."
5 | superset db upgrade
6 |
7 | echo "Initializing roles..."
8 | superset init
9 |
10 | if [ -n "$ADMIN_USER" -a -n "$ADMIN_PASS" ]; then
11 | echo "Creating admin user '${ADMIN_USER}'..."
12 | superset fab create-admin \
13 | --username "${ADMIN_USER}" \
14 | --firstname Superset \
15 | --lastname Admin \
16 | --email admin@superset.com \
17 | --password "${ADMIN_PASS}" \
18 | || true
19 | fi
20 |
21 | if [ -f "/app/pythonpath/superset_datasources.yaml" ]; then
22 | echo "Importing database connections..."
23 | superset import_datasources -p /app/pythonpath/superset_datasources.yaml
24 | fi
25 |
--------------------------------------------------------------------------------
/kube-build/templates/kaniko-trino.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: v1
3 | kind: Pod
4 | metadata:
5 | name: kaniko
6 | namespace: walden
7 | spec:
8 | nodeSelector:
9 | kubernetes.io/arch: amd64
10 | containers:
11 | - name: kaniko
12 | image: gcr.io/kaniko-project/executor:debug
13 | args:
14 | - "--dockerfile=Dockerfile"
15 | - "--context-sub-path=docker/trino"
16 | - "--context=git://github.com/scie-nz/walden.git"
17 | - "--destination=scienz/walden-trino"
18 | volumeMounts:
19 | - name: docker-config
20 | mountPath: /kaniko/.docker
21 | volumes:
22 | - name: docker-config
23 | projected:
24 | sources:
25 | - secret:
26 | name: regcred
27 | items:
28 | - key: .dockerconfigjson
29 | path: config.json
30 | restartPolicy: Never
31 |
--------------------------------------------------------------------------------
/kube-build/templates/kaniko-devserver.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: v1
3 | kind: Pod
4 | metadata:
5 | name: kaniko
6 | namespace: walden
7 | spec:
8 | nodeSelector:
9 | kubernetes.io/arch: amd64
10 | containers:
11 | - name: kaniko
12 | image: gcr.io/kaniko-project/executor:debug
13 | args:
14 | - "--dockerfile=Dockerfile"
15 | - "--context-sub-path=docker/devserver"
16 | - "--context=git://github.com/scie-nz/walden.git"
17 | - "--destination=scienz/walden-devserver"
18 | volumeMounts:
19 | - name: docker-config
20 | mountPath: /kaniko/.docker
21 | volumes:
22 | - name: docker-config
23 | projected:
24 | sources:
25 | - secret:
26 | name: regcred
27 | items:
28 | - key: .dockerconfigjson
29 | path: config.json
30 | restartPolicy: Never
31 |
--------------------------------------------------------------------------------
/tf/test-hive.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # script to run in devserver pod
4 | read -d '' EXEC_SCRIPT << EOF
5 | echo "mc alias"
6 | mc alias set walden-minio/ http://minio:9000 \$MINIO_ACCESS_KEY_ID \$MINIO_ACCESS_KEY_SECRET || exit 1
7 |
8 | echo "mc rb"
9 | mc rb --force walden-minio/direct
10 |
11 | echo "mc mb"
12 | mc mb walden-minio/direct
13 |
14 | trino_cmd() {
15 | echo "trino: \$1"
16 | trino-cli --server trino --user walden --execute="\$1"
17 | }
18 |
19 | trino_cmd "DROP TABLE IF EXISTS hive.direct.dim_foo"
20 | trino_cmd "DROP SCHEMA IF EXISTS hive.direct"
21 | trino_cmd "CREATE SCHEMA hive.direct WITH (location='s3a://direct/')"
22 | trino_cmd "CREATE TABLE hive.direct.dim_foo(key VARCHAR, val BIGINT)"
23 | trino_cmd "INSERT INTO hive.direct.dim_foo VALUES ('this', 1), ('is', 2), ('a', 3), ('test', 4)"
24 | trino_cmd "SELECT key, val FROM hive.direct.dim_foo"
25 |
26 | echo "mc ls"
27 | mc ls -r walden-minio/direct
28 | EOF
29 |
30 | kubectl exec -it -n walden deployment/devserver -- /bin/bash -c "$EXEC_SCRIPT"
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 SCIE.NZ
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/kube-build/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
4 |
5 | # strict mode
6 | set -euo pipefail
7 | IFS=$'\n\t'
8 |
9 | # print line on error
10 | err_report() {
11 | echo "Error on line $1"
12 | }
13 | trap 'err_report $LINENO' ERR
14 |
15 | # set namespace, then reset back to current afterwards
16 | # this allows us to apply across namespaces in a single 'apply' command, while still having an assigned default
17 | TARGET_NAMESPACE=walden
18 | ORIG_NAMESPACE=$(kubectl config view --minify --output 'jsonpath={..namespace}')
19 | if [ -z "$ORIG_NAMESPACE" ]; then
20 | ORIG_NAMESPACE=$TARGET_NAMESPACE
21 | fi
22 |
23 | reset_namespace() {
24 | echo "Switching back to namespace: $ORIG_NAMESPACE"
25 | kubectl config set-context --current --namespace=$ORIG_NAMESPACE
26 | }
27 | trap reset_namespace EXIT
28 |
29 | echo "Switching to namespace: $TARGET_NAMESPACE"
30 | # if namespace doesn't exist, create it
31 | kubectl create namespace $TARGET_NAMESPACE --dry-run=client -o yaml | kubectl apply -f -
32 | kubectl config set-context --current --namespace=$TARGET_NAMESPACE
33 |
34 | helm template -g $SCRIPT_DIR | kubectl apply -f -
35 |
--------------------------------------------------------------------------------
/tf/minio/variables.tf:
--------------------------------------------------------------------------------
1 | variable "namespace" {
2 | type = string
3 | }
4 |
5 | variable "name" {
6 | type = string
7 | }
8 |
9 | variable "image" {
10 | type = string
11 | }
12 |
13 | variable "username" {
14 | type = string
15 | validation {
16 | condition = length(var.username) >= 3
17 | error_message = "Minio username must be at least 3 characters long"
18 | }
19 | }
20 | variable "password" {
21 | type = string
22 | validation {
23 | condition = var.password == "" || length(var.password) >= 8
24 | error_message = "Minio password must be at least 8 characters long"
25 | }
26 | }
27 | variable "replicas" {
28 | type = number
29 | validation {
30 | condition = var.replicas == 1 || var.replicas >= 4
31 | error_message = "Minio requires either exactly one replica or at least four replicas"
32 | }
33 | }
34 | variable "mem_limit" {
35 | type = string
36 | }
37 | variable "node_selector" {
38 | type = map
39 | }
40 | variable "tolerations" {
41 | type = list(object({
42 | effect = string
43 | key = string
44 | operator = string
45 | value = string
46 | }))
47 | }
48 | variable "storage" {
49 | type = string
50 | }
51 | variable "nfs_server" {
52 | type = string
53 | }
54 | variable "nfs_path" {
55 | type = string
56 | }
57 |
--------------------------------------------------------------------------------
/docker/superset/Dockerfile:
--------------------------------------------------------------------------------
1 | # Based on the latest stable release
2 | # check https://hub.docker.com/r/apache/superset/tags (search for '4.' to get releases)
3 | FROM apache/superset:4.1.1
4 |
5 | USER root
6 | # Geckodriver prerequisites
7 | RUN apt-get update \
8 | && apt-get -y install --no-install-recommends firefox-esr \
9 | && apt-get -y upgrade \
10 | && apt-get clean \
11 | && rm -rf /var/lib/apt/lists /var/cache/apt/archives
12 |
13 | # Install geckodriver to support rendering charts in email alerts
14 | ENV GECKODRIVER_VERSION=0.35.0
15 | RUN wget https://github.com/mozilla/geckodriver/releases/download/v${GECKODRIVER_VERSION}/geckodriver-v${GECKODRIVER_VERSION}-linux64.tar.gz \
16 | && tar -x geckodriver -zf geckodriver-v${GECKODRIVER_VERSION}-linux64.tar.gz -O > /usr/bin/geckodriver \
17 | && rm geckodriver-v${GECKODRIVER_VERSION}-linux64.tar.gz \
18 | && chmod 755 /usr/bin/geckodriver \
19 | && geckodriver --version
20 | USER superset
21 |
22 | # Install a selection of drivers for connecting Superset to various database types and auth integrations.
23 | #
24 | # See here for full list of supported DBs and their connection strings:
25 | # https://superset.apache.org/docs/configuration/databases/
26 | #
27 | # - authlib + flask-oidc: Support OIDC auth
28 | # - trino: Support talking to Trino in Walden
29 | RUN pip3 install --break-system-packages \
30 | authlib==1.4.0 \
31 | flask-oidc==2.2.2 \
32 | \
33 | trino==0.332.0
34 |
--------------------------------------------------------------------------------
/docker/devserver/Dockerfile:
--------------------------------------------------------------------------------
1 | # - Trino wants java 23: https://trino.io/docs/current/installation/deployment.html
2 | # - Ubuntu only has openjdk-23-jre in 24.10+ (not in 24.04LTS): https://packages.ubuntu.com/search?keywords=openjdk-23-jre&searchon=names&suite=all&section=all
3 | # So we go with 24.10:
4 | FROM ubuntu:24.10
5 |
6 | ENV DEBIAN_FRONTEND=noninteractive
7 |
8 | RUN apt-get update \
9 | && apt-get -y install curl git gnupg2 less openjdk-23-jre python3-pip tzdata unzip vim wget \
10 | && apt-get -y upgrade \
11 | && apt-get clean \
12 | && rm -rf /var/lib/apt/lists /var/cache/apt/archives
13 |
14 | ENV TRINO_VERSION=468 \
15 | TRINO_PY_VERSION=0.332.0 \
16 | MINIO_VERSION=RELEASE.2025-01-17T23-25-50Z \
17 | MINIO_PY_VERSION=7.2.15
18 |
19 | RUN \
20 | curl -o /usr/bin/mc https://dl.min.io/client/mc/release/linux-amd64/archive/mc.${MINIO_VERSION} \
21 | && chmod +x /usr/bin/mc \
22 | && mc --version \
23 | \
24 | && curl -o /usr/bin/trino-cli https://repo1.maven.org/maven2/io/trino/trino-cli/${TRINO_VERSION}/trino-cli-${TRINO_VERSION}-executable.jar \
25 | && echo 'trino-cli --server trino --user "${TRINO_USER}" --catalog hive --schema $1' > /usr/bin/trino \
26 | && chmod +x /usr/bin/trino-cli /usr/bin/trino \
27 | && trino-cli --version \
28 | \
29 | && pip3 install --break-system-packages minio==${MINIO_PY_VERSION} trino==${TRINO_PY_VERSION} \
30 | && python3 -c "import minio; print('minio {}'.format(minio.__version__))" \
31 | && python3 -c "import trino; print('trino {}'.format(trino.__version__))"
32 |
--------------------------------------------------------------------------------
/components.drawio:
--------------------------------------------------------------------------------
1 | 7Vpbb9owFP41PHZKYhLCY4FunUbVdmxqu5fJTdzgYeLIMQX262c3DoljKGkVbtVekH1iO/Z3vnNzaIH+dPGFwWR8RUNEWo4VLlpg0HKcjuWIXylYZgK/AzJBxHCYiexCMMJ/kRJaSjrDIUq1gZxSwnGiCwMaxyjgmgwyRuf6sCdK9LcmMEKGYBRAYkrvcMjH6hSuVcgvEY7G+ZttSz2ZwnywEqRjGNJ5SQQuWqDPKOVZa7roIyKxy3HJ5n3e8HS1MYZiXmfCN3/4C/yGwblzffazfX0bXkySM6ebLfMMyUydWO2WL3MIGJ3FIZKrWC3Qm48xR6MEBvLpXOhcyMZ8SkTPFk21HGIcLTZu1F4dX9AG0SnibCmGqAleRyGmKOPmkM4LBdg5quMy+PlAqJQerdYucBENBc1bYLKOD6ZOW4cJeAeHyfaPDybX02Gyu3VhAjtjk7sdJhQKN6S6lPExjWgMyUUh7elAFmOGlCYKvj+I86XyqXDG6Tpw5Yteh1bsi85YgF5TvPLMkEWIvzbOWa8rhgjk+FnfSPP0NGD/wXBMW45HxKZ7j0y0ItnqU8pCHENOmaGYLYyFaZKFoSe8kMpphML2dkv31zDY3xmBDSBvaMojhka3QwMwEfcS2QyWBAvCMrDd7B8zag8fVwIYTKIXwl/PuFgGKXmacdt2G3KpFV8B1viKdUB7uwLadBQjwUmZspwwyl63Et/BgVHufFA6u74OtOMeGGgzPfiOQpx+KIydA2Nsm6nqhih3R9kEMRP9A0Q4r5rL+geOcHbtXOGYUAQVFDuHRtFMFE4ARefIsi3bjE+jWSLQEql2FS9xbK5DlHJGJ6hPichlwSCm0hH2njAhFREkOIqlzxUwISHvSRBxAMm5ejDFYfhSeKzTiV6MNOFXa5Rt3hot7K5oMz3CpaxYHOsKcZiKtMxMyj6MNurcNXT3qo0aNw0oDs/lDaCEkcA0xYGuCx0lgQRb3pc7D7Lzyc27g0X54WCZ9xaY36sVZbs0S/SKSbKz1DTSTOVdt/Qu3TlqGi5p0F2jwVxWu0JXb7ihWBxlYwnbrjIjO6ia5ZRuMSsLVS8HvarbzYAwFnph2erY7yceMNOrKxx/vf64tl/nOnavnhiYnrgx2//UcTXzr2n8hcE/lFzB7o3fr2n8GzR8YrZfzQnAvm3fTGebI55zSsQD4CSIZ+l8WaUobyaef2DiAYN4A/ScIiYcu1lLJTQ0aLnvry7tiqk6NYuonV2LgPb/oKF/y9yeMXaPKmo4TUWNzp6Nd4elymlFjfq1ynExr/1e5rmVzNluinmiW/xxJBte/PsGXPwD
--------------------------------------------------------------------------------
/tf/superset/variables.tf:
--------------------------------------------------------------------------------
1 | variable "namespace" {
2 | type = string
3 | }
4 |
5 | variable "image_busybox" {
6 | type = string
7 | }
8 | variable "image_superset" {
9 | type = string
10 | }
11 |
12 | variable "username" {
13 | type = string
14 | }
15 | variable "password" {
16 | type = string
17 | }
18 | variable "worker_replicas" {
19 | type = number
20 | }
21 | variable "mem_limit_server" {
22 | type = string
23 | }
24 | variable "mem_limit_worker" {
25 | type = string
26 | }
27 |
28 | variable "postgres_host" {
29 | type = string
30 | }
31 | variable "postgres_port" {
32 | type = number
33 | }
34 | variable "postgres_db" {
35 | type = string
36 | }
37 | variable "postgres_secret_name" {
38 | type = string
39 | }
40 |
41 | variable "redis_host" {
42 | type = string
43 | }
44 | variable "redis_secret_name" {
45 | type = string
46 | }
47 |
48 | variable "extra_datasources" {
49 | type = string
50 | }
51 |
52 | variable "scheduler_node_selector" {
53 | type = map
54 | }
55 | variable "worker_node_selector" {
56 | type = map
57 | }
58 | variable "app_node_selector" {
59 | type = map
60 | }
61 | variable "scheduler_tolerations" {
62 | type = list(object({
63 | effect = string
64 | key = string
65 | operator = string
66 | value = string
67 | }))
68 | }
69 | variable "worker_tolerations" {
70 | type = list(object({
71 | effect = string
72 | key = string
73 | operator = string
74 | value = string
75 | }))
76 | }
77 | variable "app_tolerations" {
78 | type = list(object({
79 | effect = string
80 | key = string
81 | operator = string
82 | value = string
83 | }))
84 | }
85 |
--------------------------------------------------------------------------------
/tf/devserver/devserver.tf:
--------------------------------------------------------------------------------
1 | resource "kubernetes_persistent_volume_claim" "devserver" {
2 | metadata {
3 | name = "devserver"
4 | namespace = var.namespace
5 | }
6 | spec {
7 | access_modes = [
8 | "ReadWriteOnce",
9 | ]
10 | resources {
11 | requests = {
12 | storage = "1Gi"
13 | }
14 | }
15 | }
16 | }
17 |
18 | resource "kubernetes_deployment" "devserver" {
19 | metadata {
20 | name = "devserver"
21 | namespace = var.namespace
22 | }
23 | spec {
24 | selector {
25 | match_labels = {
26 | app = "devserver"
27 | }
28 | }
29 | strategy {
30 | type = "Recreate"
31 | }
32 | template {
33 | metadata {
34 | labels = {
35 | app = "devserver"
36 | }
37 | }
38 | spec {
39 | container {
40 | command = [
41 | "/bin/bash",
42 | "-c",
43 | "cd ~ && sleep infinity",
44 | ]
45 | env {
46 | name = "MINIO_ACCESS_KEY_ID"
47 | value_from {
48 | secret_key_ref {
49 | key = "user"
50 | name = var.minio_secret_name
51 | }
52 | }
53 | }
54 | env {
55 | name = "MINIO_ACCESS_KEY_SECRET"
56 | value_from {
57 | secret_key_ref {
58 | key = "pass"
59 | name = var.minio_secret_name
60 | }
61 | }
62 | }
63 | image = var.image
64 | name = "devserver"
65 | resources {
66 | limits = {
67 | memory = "4096Mi"
68 | }
69 | requests = {
70 | memory = "2048Mi"
71 | }
72 | }
73 | volume_mount {
74 | mount_path = "/root"
75 | name = "devserver-persistent-storage"
76 | }
77 | }
78 | node_selector = {
79 | "kubernetes.io/arch" = "amd64"
80 | }
81 | volume {
82 | name = "devserver-persistent-storage"
83 | persistent_volume_claim {
84 | claim_name = "devserver"
85 | }
86 | }
87 | }
88 | }
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/docker/metastore/Dockerfile:
--------------------------------------------------------------------------------
1 | # Loosely based on: https://techjogging.com/standalone-hive-metastore-presto-docker.html
2 |
3 | # Use current LTS
4 | FROM ubuntu:22.04
5 |
6 | ENV HADOOP_VERSION=3.3.1 \
7 | METASTORE_VERSION=3.1.3 \
8 | POSTGRES_JDBC_VERSION=42.3.2 \
9 | AWS_SDK_JAR_VERSION=1.11.901 \
10 | HADOOP_HOME=/opt/hadoop \
11 | HIVE_HOME=/opt/hive-metastore \
12 | DEBIAN_FRONTEND="noninteractive"
13 |
14 | RUN mkdir -p $HIVE_HOME/lib \
15 | && mkdir -p $HADOOP_HOME \
16 | && chmod a+rw $HIVE_HOME \
17 | && chmod a+rw $HIVE_HOME/lib \
18 | && chmod a+rw $HADOOP_HOME \
19 | \
20 | && apt-get update \
21 | && apt-get -y install gnupg2 curl openjdk-8-jre \
22 | && apt-get clean \
23 | && rm -rf /var/lib/apt/lists /var/cache/apt/archives \
24 | \
25 | && curl -L https://repo1.maven.org/maven2/org/apache/hive/hive-standalone-metastore/${METASTORE_VERSION}/hive-standalone-metastore-${METASTORE_VERSION}-bin.tar.gz | tar zxf - \
26 | && mv apache-hive-metastore-${METASTORE_VERSION}-bin/* $HIVE_HOME \
27 | && rmdir -v apache-hive-metastore-${METASTORE_VERSION}-bin \
28 | \
29 | && curl -L https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - \
30 | && mv -v hadoop-${HADOOP_VERSION}/* $HADOOP_HOME \
31 | && rmdir -v hadoop-${HADOOP_VERSION} \
32 | \
33 | && curl -O https://jdbc.postgresql.org/download/postgresql-${POSTGRES_JDBC_VERSION}.jar \
34 | && mv -v postgresql-${POSTGRES_JDBC_VERSION}.jar $HIVE_HOME/lib/ \
35 | \
36 | && stat $HADOOP_HOME/share/hadoop/tools/lib/aws-java-sdk-bundle-$AWS_SDK_JAR_VERSION.jar \
37 | && cp -v $HADOOP_HOME/share/hadoop/tools/lib/aws-java-sdk-bundle-$AWS_SDK_JAR_VERSION.jar $HIVE_HOME/lib/ \
38 | && cp -v $HADOOP_HOME/share/hadoop/tools/lib/hadoop-aws-$HADOOP_VERSION.jar $HIVE_HOME/lib/ \
39 | \
40 | && rm -v $HIVE_HOME/lib/guava-*.jar \
41 | && stat $HADOOP_HOME/share/hadoop/common/lib/guava-27.0-jre.jar \
42 | && cp -v $HADOOP_HOME/share/hadoop/common/lib/guava-27.0-jre.jar $HIVE_HOME/lib/
43 |
44 | ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \
45 | HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$HIVE_HOME/lib/aws-java-sdk-bundle-$AWS_SDK_JAR_VERSION.jar:$HIVE_HOME/lib/hadoop-aws-$HADOOP_VERSION.jar:$HIVE_HOME/lib/postgresql-$POSTGRES_JDBC_VERSION.jar \
46 | METASTORE_AUX_JARS_PATH=$HIVE_HOME/lib/aws-java-sdk-bundle-$AWS_SDK_JAR_VERSION.jar:$HIVE_HOME/lib/hadoop-aws-$HADOOP_VERSION.jar
47 |
--------------------------------------------------------------------------------
/tf/superset/superset_config.py:
--------------------------------------------------------------------------------
1 | import os
2 | from celery.schedules import crontab
3 | from cachelib.redis import RedisCache
4 |
5 | def env(key, default=None):
6 | val = os.getenv(key, default)
7 | if val is None:
8 | raise Exception("Missing required envvar: {}".format(key))
9 | return val
10 |
11 | MAPBOX_API_KEY = env('MAPBOX_API_KEY', '')
12 | CACHE_CONFIG = {
13 | 'CACHE_TYPE': 'redis',
14 | 'CACHE_DEFAULT_TIMEOUT': 300,
15 | 'CACHE_KEY_PREFIX': 'superset_',
16 | 'CACHE_REDIS_HOST': env('REDIS_HOST'),
17 | 'CACHE_REDIS_PORT': env('REDIS_PORT', 6379),
18 | 'CACHE_REDIS_PASSWORD': env('REDIS_PASSWORD'),
19 | 'CACHE_REDIS_DB': env('REDIS_CACHE_DB', 1),
20 | }
21 | DATA_CACHE_CONFIG = CACHE_CONFIG
22 | EXPLORE_FORM_DATA_CACHE_CONFIG = CACHE_CONFIG
23 | FILTER_STATE_CACHE_CONFIG = CACHE_CONFIG
24 |
25 | SQLALCHEMY_DATABASE_URI = f"postgresql+psycopg2://{env('POSTGRES_USER')}:{env('POSTGRES_PASSWORD')}@{env('POSTGRES_HOST')}:{env('POSTGRES_PORT', 5432)}/{env('POSTGRES_DB', 'superset')}"
26 | SQLALCHEMY_TRACK_MODIFICATIONS = True
27 | SECRET_KEY = env('SECRET_KEY')
28 |
29 | # Flask-WTF flag for CSRF
30 | WTF_CSRF_ENABLED = True
31 | # Add endpoints that need to be exempt from CSRF protection
32 | WTF_CSRF_EXEMPT_LIST = []
33 | # A CSRF token that expires in 1 year
34 | WTF_CSRF_TIME_LIMIT = 60 * 60 * 24 * 365
35 |
36 | class CeleryConfig(object):
         """Celery settings for Superset's async queries and scheduled reports.

         Note: these are the legacy (pre-Celery 5) uppercase setting names.
         The broker and the result backend use separate Redis logical DBs:
         REDIS_CELERY_DB (default 0) and REDIS_RESULTS_DB (default 1).
         """
37 |     BROKER_URL = f"redis://:{env('REDIS_PASSWORD')}@{env('REDIS_HOST')}:{env('REDIS_PORT', 6379)}/{env('REDIS_CELERY_DB', 0)}"
38 |     CELERY_IMPORTS = ('superset.sql_lab', )
39 |     CELERY_RESULT_BACKEND = f"redis://:{env('REDIS_PASSWORD')}@{env('REDIS_HOST')}:{env('REDIS_PORT', 6379)}/{env('REDIS_RESULTS_DB', 1)}"
40 |     CELERYD_LOG_LEVEL = "DEBUG"
         # Prefetch one task at a time and acknowledge only after completion,
         # so an in-flight task is redelivered if its worker dies.
41 |     CELERYD_PREFETCH_MULTIPLIER = 1
42 |     CELERY_ACKS_LATE = True
         # Per-task rate limits; email reports also get a 10-minute hard and
         # soft time limit and skip result storage.
43 |     CELERY_ANNOTATIONS = {
44 |         'tasks.add': {
45 |             'rate_limit': '10/s'
46 |         },
47 |         'sql_lab.get_sql_results': {
48 |             'rate_limit': '100/s',
49 |         },
50 |         'email_reports.send': {
51 |             'rate_limit': '1/s',
52 |             'time_limit': 600,
53 |             'soft_time_limit': 600,
54 |             'ignore_result': True,
55 |         }
56 |     }
         # Beat schedule: run the report scheduler every minute and prune the
         # report execution log daily at midnight.
57 |     CELERYBEAT_SCHEDULE = {
58 |         'reports.scheduler': {
59 |             'task': 'reports.scheduler',
60 |             'schedule': crontab(minute='*', hour='*'),
61 |         },
62 |         'reports.prune_log': {
63 |             'task': 'reports.prune_log',
64 |             'schedule': crontab(minute=0, hour=0),
65 |         }
66 |     }
67 | CELERY_CONFIG = CeleryConfig
68 |
     # Cache used by Superset to store SQL Lab query results.
     # NOTE(review): this reads REDIS_CELERY_DB (default 0) while
     # CELERY_RESULT_BACKEND above uses REDIS_RESULTS_DB (default 1) --
     # confirm whether the results backend was meant to share the results DB.
     # NOTE(review): port may be a string when REDIS_PORT is set (env()
     # returns str for set variables) -- confirm RedisCache accepts that.
69 | RESULTS_BACKEND = RedisCache(
70 |     host=env('REDIS_HOST'),
71 |     port=env('REDIS_PORT', 6379),
72 |     password=env('REDIS_PASSWORD'),
73 |     db=env('REDIS_CELERY_DB', 0),
74 |     key_prefix='superset_results'
75 | )
76 |
77 | ####
78 | # User superset config begins here
79 | ####
80 |
--------------------------------------------------------------------------------
/tf/redis/redis.tf:
--------------------------------------------------------------------------------
     # 32-character random password for Redis AUTH; special characters are
     # disabled, presumably to avoid shell/URL quoting issues where the value
     # is interpolated (e.g. the redis-server command line) -- confirm.
1 | resource "random_password" "pass" {
2 |   length = 32
3 |   special = false
4 | }
5 |
     # Secret storing the generated Redis password under key "pass"; it is
     # consumed below by the server container's REDIS_PASSWORD env var.
6 | resource "kubernetes_secret" "redis" {
7 |   metadata {
8 |     labels = {
9 |       app = var.name
10 |     }
11 |     name = var.name
12 |     namespace = var.namespace
13 |   }
14 |   type = "Opaque"
15 |   data = {
16 |     pass = random_password.pass.result
17 |   }
18 | }
19 |
     # Cluster-internal Service exposing Redis on 6379, routed to the
     # container port named "redis" on pods labeled app=var.name.
20 | resource "kubernetes_service" "redis" {
21 |   metadata {
22 |     labels = {
23 |       app = var.name
24 |     }
25 |     name = var.name
26 |     namespace = var.namespace
27 |   }
28 |   spec {
29 |     port {
30 |       name = "redis"
31 |       port = 6379
32 |       target_port = "redis"
33 |     }
34 |     selector = {
35 |       app = var.name
36 |     }
37 |     type = "ClusterIP"
38 |   }
39 | }
40 |
     # Single-replica Redis StatefulSet with a persistent /data volume.
41 | resource "kubernetes_stateful_set" "redis" {
42 |   metadata {
43 |     labels = {
44 |       app = var.name
45 |     }
46 |     name = var.name
47 |     namespace = var.namespace
48 |   }
49 |   spec {
50 |     replicas = 1
51 |     selector {
52 |       match_labels = {
53 |         app = var.name
54 |       }
55 |     }
56 |     service_name = var.name
57 |     template {
58 |       metadata {
59 |         labels = {
60 |           app = var.name
61 |         }
62 |       }
63 |       spec {
64 |         container {
            # The shell expands $REDIS_PASSWORD and $LOG_LEVEL from the env
            # vars declared below. NOTE(review): the password ends up in the
            # redis-server argv and is visible via `ps` inside the container;
            # consider moving it to a config file. io-threads is hardcoded to
            # 3 while maxmemory is parameterized -- confirm that is intended.
65 |           command = [
66 |             "/bin/sh",
67 |             "-c",
68 |             "redis-server --bind 0.0.0.0 --requirepass $REDIS_PASSWORD --loglevel $LOG_LEVEL --dir /data --maxmemory ${var.max_memory} --maxmemory-policy allkeys-lru --lazyfree-lazy-eviction yes --lazyfree-lazy-expire yes --io-threads 3",
69 |           ]
70 |           env {
71 |             name = "LOG_LEVEL"
72 |             value = "notice"
73 |           }
74 |           env {
75 |             name = "REDIS_PASSWORD"
76 |             value_from {
77 |               secret_key_ref {
78 |                 key = "pass"
79 |                 name = kubernetes_secret.redis.metadata[0].name
80 |               }
81 |             }
82 |           }
83 |           image = var.image
84 |           name = "server"
85 |           port {
86 |             container_port = 6379
87 |             name = "redis"
88 |           }
            # Startup gate: pod is considered started once the TCP port
            # accepts connections. NOTE(review): no liveness/readiness probe
            # and no resource requests/limits are set -- confirm acceptable.
89 |           startup_probe {
90 |             initial_delay_seconds = 5
91 |             period_seconds = 10
92 |             tcp_socket {
93 |               port = "redis"
94 |             }
95 |           }
96 |           volume_mount {
97 |             mount_path = "/data"
98 |             name = "storage"
99 |           }
100 |         }
101 |       }
102 |     }
        # Per-replica persistent volume for Redis's on-disk state (--dir /data).
103 |     volume_claim_template {
104 |       metadata {
105 |         name = "storage"
106 |       }
107 |       spec {
108 |         access_modes = [
109 |           "ReadWriteOnce",
110 |         ]
111 |         resources {
112 |           requests = {
113 |             storage = var.storage
114 |           }
115 |         }
116 |       }
117 |     }
118 |   }
119 | }
120 |
--------------------------------------------------------------------------------
/tf/main.tf:
--------------------------------------------------------------------------------
     # Both providers authenticate against the cluster via the local
     # kubeconfig; provider versions are pinned for reproducible applies.
1 | provider "kubernetes" {
2 |   config_path = "~/.kube/config"
3 | }
4 |
5 | provider "helm" {
6 |   kubernetes {
7 |     config_path = "~/.kube/config"
8 |   }
9 | }
10 |
11 | terraform {
12 |   required_providers {
13 |     helm = {
14 |       source = "hashicorp/helm"
15 |       version = "2.17.0"
16 |     }
17 |     kubernetes = {
18 |       source = "hashicorp/kubernetes"
19 |       version = "2.35.1"
20 |     }
21 |     random = {
22 |       source = "hashicorp/random"
23 |       version = "3.6.3"
24 |     }
25 |   }
26 | }
27 |
     # Kubernetes namespace that all following modules/releases deploy into.
28 | module "namespace" {
29 |   source = "./namespace"
30 |
31 |   name = var.namespace
32 | }
33 |
     # Optional in-cluster Postgres backing Nessie; created only when
     # var.nessie_postgres_internal is true (otherwise an external database
     # URL is used by the helm release below). Storage is fixed at 1Gi.
34 | module "nessie_postgres" {
35 |   count = var.nessie_postgres_internal ? 1 : 0
36 |   source = "./postgres"
37 |
38 |   namespace = module.namespace.name
39 |   name = "nessie-postgres"
40 |   image = var.image_postgres
41 |
42 |   db = "nessie"
43 |   storage = "1Gi"
44 | }
45 |
     # Nessie catalog service, installed from the upstream chart with values
     # rendered from nessie-values.yaml. The JDBC URL points at the internal
     # Postgres unless an external URL is configured.
     # NOTE(review): jdbc_secret_name is empty for the external case --
     # confirm credentials are then carried in var.nessie_postgres_url or
     # supplied through another secret.
46 | resource "helm_release" "nessie" {
47 |   name = "nessie"
48 |   namespace = module.namespace.name
49 |   repository = "https://charts.projectnessie.org/"
50 |   chart = "nessie"
51 |   # latest from https://projectnessie.org/nessie-latest/
52 |   version = "0.102.2"
53 |
54 |   values = [
55 |     templatefile("${path.module}/nessie-values.yaml", {
56 |       jdbc_url = var.nessie_postgres_internal ? "jdbc:postgresql://nessie-postgres:5432/nessie" : var.nessie_postgres_url
57 |       jdbc_secret_name = var.nessie_postgres_internal ? "nessie-postgres" : ""
58 |     })
59 |   ]
60 | }
61 |
     # MinIO object storage. Credentials, replica count, memory limit and
     # scheduling constraints are all passed through from root variables.
     # NOTE(review): storage is pinned at 1Gi and NFS is disabled via empty
     # server/path strings -- confirm these shouldn't be variables too.
62 | module "minio" {
63 |   source = "./minio"
64 |
65 |   namespace = module.namespace.name
66 |   name = "minio"
67 |   image = var.image_minio
68 |
69 |   username = var.minio_username
70 |   password = var.minio_password
71 |   replicas = var.minio_replicas
72 |   mem_limit = var.minio_mem_limit
73 |
74 |   node_selector = var.minio_node_selector
75 |   tolerations = var.minio_tolerations
76 |
77 |   storage = "1Gi"
78 |   nfs_server = ""
79 |   nfs_path = ""
80 | }
81 |
82 | resource "helm_release" "trino" {
83 | name = "trino"
84 | namespace = module.namespace.name
85 | repository = "https://trinodb.github.io/charts"
86 | chart = "trino"
87 | # latest from https://github.com/trinodb/charts/
88 | version = "1.36.0"
89 |
90 | values = [
91 | templatefile("${path.module}/trino-values.yaml", {
92 | catalogs = yamlencode(merge({
93 | iceberg = < CREATE SCHEMA IF NOT EXISTS direct WITH (location='s3a://direct/');
105 | CREATE SCHEMA
106 | ```
107 |
108 | If you run `SHOW SCHEMAS` you should see:
109 | ```
110 | trino:direct> SHOW SCHEMAS;
111 | Schema
112 | --------------------
113 | default
114 | direct
115 | information_schema
116 | (3 rows)
117 | ```
118 |
119 | Now we can create a table and store some data:
120 | ```
121 | trino:direct> CREATE TABLE IF NOT EXISTS dim_foo(key VARCHAR, val BIGINT);
122 | CREATE TABLE
123 |
124 | trino:direct> INSERT INTO dim_foo VALUES ('this', 1), ('is', 2), ('a', 3), ('test', 4);
125 | INSERT: 4 rows
126 | ```
127 |
128 | Assuming everything is working, you should be able to query the stored values:
129 | ```
130 | trino:direct> SELECT key, val FROM dim_foo;
131 | key | val
132 | ------+-----
133 | this | 1
134 | is | 2
135 | a | 3
136 | test | 4
137 | (4 rows)
138 | ```
139 |
140 | Now we can press `Ctrl+D` to exit the Trino console session, and look at the files created in the underlying MinIO bucket we created earlier:
141 | ```
142 | trino:direct> ^D
143 |
144 | devserver# mc ls -r walden-minio/direct
145 | [2022-03-11 06:22:24 UTC] 0B STANDARD dim_foo/
146 | [2022-03-11 06:21:42 UTC] 356B STANDARD 20220311_062141_00005_26e8n_9d96d247-6da3-49f9-a537-b0bc897879b9
147 | ```
148 |
149 | We can clean up our test data by deleting the table and then the schema:
150 | ```
151 | devserver# trino direct
152 |
153 | trino:direct> DROP TABLE dim_foo;
154 | DROP TABLE
155 | trino:direct> DROP SCHEMA direct;
156 | DROP SCHEMA
157 | trino:direct> ^D
158 |
159 | devserver# mc ls walden-minio/direct
160 |
161 | ```
162 |
163 | ### Explore data with Superset
164 |
165 | Superset provides a convenient UI for exploring the data that you've stored in Trino.
166 | Walden includes an instance of Superset that's preconfigured to connect to Trino.
167 |
168 | Superset has been configured with a `walden` user and a randomly generated password.
169 |
170 | Get the password for logging into Superset:
171 | ```
172 | $ kubectl get secret -n walden superset-admin -o 'jsonpath={.data.pass}' | base64 -d && echo
173 | lONGpASSWoRD64HERE
174 | ```
175 |
176 | Set up a port-forward to access Superset on port `8088`:
177 | ```
178 | $ kubectl port-forward -n walden deployment/superset 8088
179 | ```
180 |
181 | Go to [http://127.0.0.1:8088/](http://127.0.0.1:8088/) and log in with user=`walden` and the password you got earlier.
182 |
183 | Two Trino databases should have been automatically added to Superset by Walden:
184 | - `walden-trino` has Trino data, including what we added to a `test` schema in earlier steps. The data itself is being stored to Minio in Hive columnar table format, with the Metastore acting as the index.
185 | - `walden-trino-system` has various Trino System statistics like node connectivity that may also be interesting to explore.
186 | In addition to these autoconfigured databases, you can add other external databases to Superset via `Data` > `Databases` in the top menu.
187 |
188 | We can start exploring by going to `SQL Lab` > `SQL Editor` in the top menu.
189 | In the SQL Editor view, select the `walden-trino` database, then the `test` schema that we created earlier.
190 | The values we added to this schema earlier should also be visible via Superset.
191 |
192 | 
193 |
194 | ### View the Trino Dashboard
195 |
196 | Trino also provides a dashboard UI showing basic activity. Set up a port-forward to access the dashboard on port `8080`:
197 | ```
198 | $ kubectl port-forward -n walden deployment/trino-coordinator 8080:80
199 | ```
200 |
201 | Go to [http://127.0.0.1:8080](http://127.0.0.1:8080) and log in with any value for the username (it doesn't matter).
202 |
203 | 
204 |
205 | ## Conclusions
206 |
207 | That's it, this is an easy way to get a small data lake working.
208 | This is meant to be a fully functional starting point that can be expanded and customized to fit your needs.
209 | Everything here is provided as-is, so your mileage may vary.
210 | Please report any bugs or issues and we will try to get to them.
211 |
212 | ## Cloud Provider Installation
213 |
214 | Walden can be used either on-premise or in hosted Kubernetes environments.
215 | Here are some example steps for setting up Walden in various cloud providers.
216 |
217 | ### AWS
218 |
219 | This tutorial assumes you already have an AWS account set up. Instructions should be run from either Mac or Linux machines. Also keep in mind this is likely to cost a few dollars in AWS costs to try out. We've been able to keep costs below $5 USD when running a minimal cluster for a short amount of time.
220 |
221 | #### Configure AWS EKS Admin
222 |
223 | To manage the EKS cluster programmatically from your local machine, you will need to create a new AWS IAM user and grant it appropriate permissions.
224 |
225 | The easiest way to do so reproducibly involves using a "CloudShell" session. To bring one up, search for "CloudShell" in your AWS console. Note that you should be logged in with your AWS root account when running these operations.
226 |
227 | 1. Create IAM user
228 |
229 | Run the following in your AWS cloud shell session
230 | ```
231 | aws iam create-user --user-name eksadmin
232 | ```
233 | **Note**: you can skip this step if you already have an IAM user you would like to use. Simply replace `eksadmin` with your user name where necessary.
234 |
235 | 2. Create IAM policies
236 |
237 | To be able to spin up an EKS cluster via `eksctl` we will need to define two new policies, with which we will associate our EKS admin user. The policy definitions are available in policy documents stored in this repository. To access them, clone the repository first (from your AWS cloud shell session):
238 |
239 | ```
240 | git clone https://github.com/scie-nz/walden.git
241 | cd walden
242 | ```
243 |
244 | We will preserve the account identifier for future use:
245 |
246 | ```
247 | export AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text)
248 | echo $AWS_ACCOUNT_ID
249 | ```
250 |
251 | We can now render the policies, i.e. by substituting the `AWS_ACCOUNT_ID` we just captured:
252 | ```
253 | cat aws/eks-all-access.json | envsubst > eks-all-access-rendered.json
254 | cat aws/iam-limited-access.json | envsubst > iam-limited-access-rendered.json
255 | ```
256 |
257 | Then we will create our policies:
258 | ```
259 | aws iam create-policy --policy-name EKSAllAccess --policy-document file://eks-all-access-rendered.json
260 | aws iam create-policy --policy-name IAMLimitedAccess --policy-document file://iam-limited-access-rendered.json
261 | ```
262 |
263 | 3. Create IAM EKS Admin group
264 |
265 | First we create our group and add our EKS admin user to it:
266 |
267 | ```
268 | aws iam create-group --group-name EKSAdmins
269 | aws iam add-user-to-group --group-name EKSAdmins --user-name eksadmin
270 | ```
271 |
272 | We are then ready to attach permissions to the group:
273 |
274 | ```
275 | aws iam attach-group-policy --group-name EKSAdmins --policy-arn arn:aws:iam::$AWS_ACCOUNT_ID:policy/EKSAllAccess
276 | aws iam attach-group-policy --group-name EKSAdmins --policy-arn arn:aws:iam::$AWS_ACCOUNT_ID:policy/IAMLimitedAccess
277 | aws iam attach-group-policy --group-name EKSAdmins --policy-arn arn:aws:iam::aws:policy/AmazonEC2FullAccess
278 | aws iam attach-group-policy --group-name EKSAdmins --policy-arn arn:aws:iam::aws:policy/AWSCloudFormationFullAccess
279 | ```
280 |
281 | You are now ready to interact with the cluster using your `eksadmin` user. The rest of the interactions will occur via your regular machine, rather than a cloud shell instance.
282 |
283 | #### Deploy EKS cluster
284 |
285 | The following set of operations are meant to run on your home machine (Mac or Linux only).
286 |
287 | To provision this cluster you will need to have the following software installed:
288 | - the AWS [cli](https://aws.amazon.com/cli/)
289 | - [eksctl](https://docs.aws.amazon.com/eks/latest/userguide/eksctl.html)
290 | - [kubectl](https://kubernetes.io/docs/tasks/tools/)
291 |
292 | 1. Set up your AWS CLI authentication
293 |
294 | Edit `~/.aws/config` and add your AWS access key ID and secret (your access key ID starts with `AKIA`):
295 |
296 | ```
297 | [walden]
298 | aws_access_key_id=[YOUR ACCESS KEY ID GOES HERE]
299 | aws_secret_access_key=[YOUR ACCESS SECRET GOES HERE]
300 | ```
301 |
302 | Then run:
303 | ```
304 | export AWS_PROFILE=walden
305 | aws sts get-caller-identity
306 | ```
307 |
308 | You should see something like:
309 | ```
310 | {
311 | "UserId": "AIDA**********",
312 | "Account": "1234567890",
313 | "Arn": "arn:aws:iam::1234567890:user/[someusername]"
314 | }
315 | ```
316 |
317 | 2. Create minimal EKS cluster using `eksctl`
318 |
319 | To create a minimal cluster, run:
320 | ```
321 | eksctl create cluster --name=eks-walden --nodes=4 --node-type=r5.large --spot
322 | ```
323 |
324 | This command will create an EKS cluster, and one default node group with 4 nodes in it. This is purely a test cluster; `eksctl` is a very powerful tool that allows you to customize your cluster however you see fit.
325 |
326 | The command will take about 30 minutes to run, while AWS provisions requisite resources.
327 |
328 | Once the cluster creation has succeeded, run:
329 | ```
330 | kubectl get nodes
331 | ```
332 |
333 | #### Deploy Walden
334 |
335 | You now have a working EKS cluster, on which you can deploy Walden just as you would on an on-premise cluster. Follow [these instructions](https://github.com/scie-nz/walden#deploy-walden) to deploy it.
336 |
337 | #### Clean up
338 |
339 | First, get the name of your nodegroup:
340 | ```
341 | eksctl get nodegroup --cluster eks-walden
342 | ```
343 |
344 | Then, delete the nodegroup:
345 | ```
346 | eksctl delete nodegroup [NODEGROUP NAME GOES HERE] --cluster eks-walden
347 | ```
348 |
349 | You can now delete your cluster:
350 | ```
351 | eksctl delete cluster eks-walden
352 | ```
353 |
354 | Finally, you should clean up your EBS volumes. You can do so by visiting the [Volumes](https://us-west-2.console.aws.amazon.com/ec2/v2/home?#Volumes) section in your AWS console.
355 |
356 | NOTE: please take care when cleaning your EBS volumes. You may lose data you care about. Make sure you understand what volumes you're deleting.
357 |
358 | ### Azure
359 |
360 | This tutorial assumes you have a working Azure account, with default quota settings. You will likely need to activate pay-as-you-go billing to be able to provision the AKS cluster described here.
361 |
362 | #### Create AKS cluster
363 |
364 | First, create a dedicated resource group, this example uses the `centralus` region:
365 | ```
366 | az group create --name WaldenResourceGroup -l centralus
367 | ```
368 |
369 | You are now ready to create your cluster:
370 | ```
371 | az aks create -g WaldenResourceGroup -n WaldenAKS --node-count 5 --node-vm-size Standard_B2ms
372 | ```
373 |
374 | To connect to the cluster:
375 | ```
376 | az aks get-credentials --resource-group WaldenResourceGroup --name WaldenAKS
377 | ```
378 |
379 | #### Deploy Walden
380 |
381 | You now have a working AKS cluster, on which you can deploy Walden just as you would on an on-premise cluster. Follow [these instructions](https://github.com/scie-nz/walden#deploy-walden) to deploy it.
382 |
383 | #### Clean up
384 |
385 | First, delete the cluster:
386 | ```
387 | az aks delete --resource-group WaldenResourceGroup --name WaldenAKS
388 | ```
389 |
390 | You can now delete the resource group:
391 | ```
392 | az group delete --resource-group WaldenResourceGroup
393 | ```
394 |
395 | ## Advanced topics
396 |
397 | ### Adding external data sources via Trino
398 |
399 | External databases can be added to Walden by [connecting them to Trino](https://trino.io/docs/current/connector.html) as a separate "Catalog". The new Trino Catalog can then be [added to Superset](https://superset.apache.org/docs/connecting-to-databases/installing-database-drivers).
400 |
401 | This strategy allows using both Trino and Superset to interact with the external data. However, some data types (such as GIS geometry columns) may not work well with Trino. In those cases you can instead connect Superset to the external database directly as described in the next section below.
402 |
403 | 1. Create a separate Kubernetes `ConfigMap` named `trino-catalog-extra` in the `walden` namespace. This `ConfigMap` should contain one or more `.properties` files for each [Trino connector](https://trino.io/docs/current/connector.html) that you want.
404 | 2. If Walden is already deployed, restart the `trino-*` pods manually for the change to take effect.
405 | ```
406 | $ kubectl delete pod -n walden trino-coordinator-xxxx-yyyy trino-worker-xxxx-yyyy
407 | ```
408 | 3. Verify that the external data source is accessible by logging in to the `devserver` pod as described above and checking for a data source named `custom`.
409 | ```
410 | $ kubectl exec -it -n walden deployment/devserver -- /bin/bash
411 | # trino-cli --server trino --catalog custom
412 | trino> SHOW SCHEMAS;
413 | trino> DESCRIBE .;
414 | ```
415 |
416 | Now we should be able to add the new Trino catalog to Superset:
417 |
418 | 1. Open the Superset UI and log in as described above.
419 | ```
420 | $ kubectl get secret -n walden superset-admin -o 'jsonpath={.data.pass}' | base64 -d && echo
421 | lONGpASSWoRD64HERE
422 | $ kubectl port-forward -n walden deployment/superset 8088
423 | ```
424 | 2. Go to `Data` > `Databases` via the top menu and click the `+ Database` on the upper right to add a new Database.
425 | 3. Select the `Trino` database type from the pull down menu.
426 | 4. Set the `SQLAlchemy URI` to `trino://trino/custom`.
427 | 5. (OPTIONAL) Switch to the `Advanced` tab and enable the following:
428 | - SQL Lab:
429 | - `Expose database in SQL Lab`, followed by...
430 | - `Allow Multi Schema Metadata Fetch` (optional: don't enable if the DB is very large)
431 | - `Enable query cost estimation`
432 | - `Allow this database to be explored`
433 | - Performance:
434 | - `Asynchronous query execution`
435 | 6. Click `Connect` to create the new Database entry.
436 |
437 | The new Database entry can be reconfigured again later if needed.
438 |
439 | Now you can switch to `SQL Lab` > `SQL Editor` and preview the new Database, confirming that it looks as expected.
440 |
441 | Check the [Trino](https://trino.io/docs/current/connector.html) and [Superset](https://superset.apache.org/docs/connecting-to-databases/installing-database-drivers) docs for any additional information on configuring particular database types.
442 |
443 | 
444 |
445 | ### Adding external data sources (and other CLI configuration) to Superset
446 |
447 | Instead of connecting an external database via Trino, the external database may instead be connected to Superset directly. This means the data will only be accessible via the Superset UI, and will not be accessible via Trino.
448 |
449 | Follow the above steps for logging into the Superset UI and adding a new Database entry, except this time you should pick the type of database that you are adding, instead of Trino. The steps are otherwise similar. If your datatype isn't listed, you may need to build a custom `walden-superset` Docker image that installs the required python module(s).
450 |
451 | Check the [Superset docs](https://superset.apache.org/docs/connecting-to-databases/installing-database-drivers) for any additional information on configuring particular database types.
452 |
453 | If you wish to provide the additional datasources declaratively via a YAML file, you can do so with something like the following custom `ConfigMap`. The special `superset_init_custom.sh` script allows running your own custom CLI commands on Superset startup. The `superset-custom` `ConfigMap` will take effect after restarting the `superset` and `superset-worker` pods:
454 |
455 | ```
456 | apiVersion: v1
457 | kind: ConfigMap
458 | metadata:
459 | name: superset-custom
460 | namespace: walden
461 | data:
462 | superset_init_custom.sh: |
463 | superset import_datasources -p /app/pythonpath/superset_datasources_custom.yaml
464 | superset_datasources_custom.yaml: |
465 | databases:
466 | - name: my_database
467 | ...config here...
468 | ```
469 |
470 | ### Adding/overriding superset_config.py configuration
471 |
472 | The provided `superset_config.py` provides a reasonable base configuration for integrating with Walden, using Postgres as the metastore and Redis as the cache.
473 |
474 | You may want to customize this configuration, for example to configure a custom authentication provider. To do this, create your own `ConfigMap` named `superset-custom` which contains your own `superset_config.py`, and/or any other files that should be included in the same directory as `superset_config.py`. The content of your custom `superset_config.py` will be concatenated to the end of the [default Walden `superset_config.py`](kube/configs/superset_config.py), and any additional files you provide will be copied into the same directory.
475 |
476 | Here is a minimal example of configuring custom additions to `superset_config.py`, which will take effect after restarting the `superset` and `superset-worker` pods. This can be combined with the above example of running custom superset CLI commands on pod startup:
477 |
478 | ```
479 | apiVersion: v1
480 | kind: ConfigMap
481 | metadata:
482 | name: superset-custom
483 | namespace: walden
484 | data:
485 | superset_config.py: |
486 | print("hello world! this is a custom config")
487 | custom_sso_security_manager.py: |
488 | # for example, this could have your custom SupersetSecurityManager implementation
489 | # see docs: https://superset.apache.org/docs/installation/configuring-superset/#custom-oauth2-configuration
490 | ```
491 |
492 | Similarly, if you need to provide some credentials for your config, they can be specified in a separate `Secret` that's also named `superset-custom`:
493 |
494 | ```
495 | apiVersion: v1
496 | kind: Secret
497 | metadata:
498 | name: superset-custom
499 | namespace: walden
500 | stringData:
501 | oauth_secrets.json: |
502 | { ... secret keys here ... }
503 | ```
504 |
505 | ### Building images using Kaniko
506 |
507 | Cheat sheet for building images from within an existing cluster.
508 | This can also be done locally via the Docker CLI or similar.
509 | ```
510 | kubectl create secret -n walden docker-registry regcred --docker-server=https://index.docker.io/v1/ --docker-username=[your-docker-username] --docker-password=[your-docker-password]
511 | kubectl apply -f kube-build/templates/kaniko-devserver.yaml
512 | ```
513 |
514 | After building/pushing new release images, update the tags for the affected `image_*` defaults in [`tf/variables.tf`](tf/variables.tf).
515 |
516 | ### Deploying with custom images
517 |
518 | Walden can be deployed with custom images from your registry/organization.
519 |
520 | 1. Create your own `terraform.tfvars` file with custom overrides for the `image_*` values listed under [`tf/variables.tf`](tf/variables.tf)
521 | 2. Build and push images: Run `docker/*/build.sh` and `docker/*/push.sh`
522 | 3. Deploy using custom images: Run `tf apply` under the `tf/` directory
523 |
524 | ### Deploying more MinIO nodes
525 |
526 | MinIO must be deployed with at least four nodes, which is the default number used by Walden.
527 | If you'd like to deploy more MinIO nodes, create a `terraform.tfvars` file with a custom override of the `minio_replicas` setting, then apply using `tf apply` under the `tf/` directory.
528 |
529 | ### Deploying MinIO on alternate architectures
530 |
531 | The MinIO images are multi-arch and so can be configured to run on nodes with non-`amd64` architectures.
532 | In our case, we have a mixed-architecture cluster where several `arm64` Raspberry Pis provide local storage, making them a convenient place for running the MinIO pods.
533 | To deploy with MinIO nodes on a different architecture, edit the `minio_arch` setting in your `terraform.tfvars` file.
534 | Note that we do not support custom architectures for the `walden-*` images themselves, as the underlying software doesn't deal with it well.
535 |
--------------------------------------------------------------------------------