├── .gitignore ├── infra ├── local │ └── kind.yaml ├── eks │ ├── outputs.tf │ ├── locals.tf │ ├── versions.tf │ ├── inflate.yaml │ ├── vpc.tf │ ├── main.tf │ ├── eks.tf │ └── karpenter.tf ├── irsa │ ├── s3.json │ └── trust-relationship.json └── prometheus │ ├── service.yaml │ ├── prometheus_instance.yaml │ ├── service_monitor_spark_jobs.yaml │ ├── service_monitor.yaml │ ├── pod-monitor.yaml │ └── prometheus_rbac.yaml ├── Dockerfile ├── argo_apps └── spark-operator │ └── app.yaml ├── jobs ├── hello-custom-iceberg-image.yaml ├── pi.yaml ├── taxi-job.yaml ├── taxi-job-with-node-affinity.yaml ├── test-iceberg.py ├── taxi-job-with-pod-affinity.yaml └── config │ └── spark-metrics-cfg-map.yaml ├── Dockerfile.base ├── entrypoint.sh └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .terraform.lock.hcl 3 | .terraform/ 4 | terraform.tfstate 5 | terraform.tfstate.backup 6 | .DS_Store 7 | --------------------------------------------------------------------------------
/infra/local/kind.yaml: -------------------------------------------------------------------------------- 1 | kind: Cluster 2 | apiVersion: kind.x-k8s.io/v1alpha4 3 | nodes: 4 | - role: control-plane 5 | - role: worker 6 | - role: worker 7 | --------------------------------------------------------------------------------
/infra/eks/outputs.tf: -------------------------------------------------------------------------------- 1 | output "karpenter_queue_name" { 2 | description = "The name of the created Amazon SQS queue" 3 | value = module.karpenter.queue_name 4 | } --------------------------------------------------------------------------------
/infra/irsa/s3.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": "s3:*", 7 | "Resource": [ 8 | "arn:aws:s3:::yt-lakehouse", 9 | "arn:aws:s3:::yt-lakehouse/*" 10 | ] 11 | } 12 | ] 13 | } --------------------------------------------------------------------------------
/infra/eks/locals.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | name = "eks-data" 3 | region = "us-east-1" 4 | 5 | vpc_cidr = "10.0.0.0/16" 6 | azs = slice(data.aws_availability_zones.available.names, 0, 3) 7 | 8 | tags = { 9 | Example = local.name 10 | GithubRepo = "terraform-aws-eks" 11 | GithubOrg = "terraform-aws-modules" 12 | } 13 | } --------------------------------------------------------------------------------
/infra/prometheus/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: spark-metrics-svc 5 | namespace: spark-jobs 6 | labels: 7 | app: spark-job 8 | spec: 9 | selector: 10 | type: spark-job 11 | ports: 12 | - name: metrics 13 | port: 8090 14 | targetPort: 8090 --------------------------------------------------------------------------------
/infra/eks/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = ">= 5.74" 6 | } 7 | helm = { 8 | source = "hashicorp/helm" 9 | version = ">= 2.7" 10 | } 11 | kubectl = { 12 | source = "alekc/kubectl" 13 | version = ">= 2.0" 14 | } 15 | } 16 | } --------------------------------------------------------------------------------
/infra/prometheus/prometheus_instance.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: Prometheus 3 | metadata: 4 | name: prometheus 5 | spec: 6 | serviceAccountName: prometheus 7 | serviceMonitorSelector: {} 8 | serviceMonitorNamespaceSelector: 9 | matchLabels: 10 | kubernetes.io/metadata.name: spark-jobs 11 | resources: 12 | requests: 13 | memory: 300Mi 14 | --------------------------------------------------------------------------------
/infra/prometheus/service_monitor_spark_jobs.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: spark-servicemonitor 5 | #namespace: spark-jobs 6 | spec: 7 | namespaceSelector: 8 | matchNames: 9 | - spark-jobs 10 | selector: 11 | matchLabels: 12 | #sparkoperator.k8s.io/launched-by-spark-operator: true 13 | type: spark-job 14 | endpoints: 15 | - port: "metrics" 16 | # path: /metrics/prometheus 17 | # interval: 5s --------------------------------------------------------------------------------
/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base image with Spark 2 | FROM tiagotxm/spark:3.5.3-java12-hadoop3 3 | 4 | # Define variables 5 | ENV SPARK_HOME=/opt/spark 6 | ENV PATH="$SPARK_HOME/bin:$PATH" 7 | ENV ICEBERG_VERSION=1.7.0 8 | 9 | # Add Iceberg JARs 10 | RUN mkdir -p $SPARK_HOME/jars \ 11 | && curl -L -o $SPARK_HOME/jars/iceberg-spark-runtime.jar \ 12 | https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/$ICEBERG_VERSION/iceberg-spark-runtime-3.5_2.12-$ICEBERG_VERSION.jar 13 | 14 | # Set entrypoint 15 | ENTRYPOINT [ "/opt/spark/entrypoint.sh" ] 16 | --------------------------------------------------------------------------------
/argo_apps/spark-operator/app.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Application 3 | metadata: 4 | name: spark-operator 5 | namespace: argocd 6 | spec: 7 | project: default 8 | source: 9 | repoURL: 'git@github.com:tiagotxm/charts.git' 10 | path: spark-operator 11 | targetRevision: HEAD 12 | destination: 13 | namespace: spark-operator 14 | server: 'https://kubernetes.default.svc' 15 | syncPolicy: 16 | syncOptions: 17 | - CreateNamespace=true 18 | - ServerSideApply=true 19 | automated: 20 | prune: true 21 | selfHeal: true --------------------------------------------------------------------------------
/infra/prometheus/service_monitor.yaml: -------------------------------------------------------------------------------- 1 | # apiVersion: monitoring.coreos.com/v1 2 | # kind: ServiceMonitor 3 | # metadata: 4 | # labels: 5 | # name: prometheus 6 | # name: svc-monitor-spark-controller 7 | # namespace: default 8 | # spec: 9 | # endpoints: 10 | # - interval: 30s 11 | # targetPort: metrics 12 | # path: /metrics 13 | # # scheme: https 14 | # # tlsConfig: 15 | # # insecureSkipVerify: true 16 | # namespaceSelector: 17 | # matchNames: 18 | # - spark-operator 19 | # selector: 20 | # matchLabels: 21 | # app.kubernetes.io/instance: spark-operator --------------------------------------------------------------------------------
/infra/eks/inflate.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: inflate 5 | spec: 6 | replicas: 0 7 | selector: 8 | matchLabels: 9 | app: inflate 10 | template: 11 |
metadata: 12 | labels: 13 | app: inflate 14 | spec: 15 | terminationGracePeriodSeconds: 0 16 | securityContext: 17 | runAsUser: 1000 18 | runAsGroup: 3000 19 | fsGroup: 2000 20 | containers: 21 | - name: inflate 22 | image: public.ecr.aws/eks-distro/kubernetes/pause:3.7 23 | resources: 24 | requests: 25 | cpu: 1 26 | securityContext: 27 | allowPrivilegeEscalation: false --------------------------------------------------------------------------------
/infra/irsa/trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Federated": "arn:aws:iam::763828354275:oidc-provider/oidc.eks.us-east-1.amazonaws.com/id/8DAB35CD413CD55FED7AD37ACF3B2311" 8 | }, 9 | "Action": "sts:AssumeRoleWithWebIdentity", 10 | "Condition": { 11 | "StringEquals": { 12 | "oidc.eks.us-east-1.amazonaws.com/id/8DAB35CD413CD55FED7AD37ACF3B2311:aud": "sts.amazonaws.com", 13 | "oidc.eks.us-east-1.amazonaws.com/id/8DAB35CD413CD55FED7AD37ACF3B2311:sub": "system:serviceaccount:spark-jobs:spark-operator-spark" 14 | } 15 | } 16 | } 17 | ] 18 | } 19 | --------------------------------------------------------------------------------
/jobs/hello-custom-iceberg-image.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: hello-iceberg-image 5 | namespace: default 6 | spec: 7 | type: Scala 8 | mode: cluster 9 | image: "tiagotxm/spark:3.5.3-iceberg" 10 | imagePullPolicy: IfNotPresent 11 | mainClass: org.apache.spark.examples.SparkPi 12 | mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.12-3.5.3.jar" 13 | sparkVersion: "3.5.3" 14 | restartPolicy: 15 | type: Never 16 | driver: 17 | cores: 1 18 | coreLimit: "1200m" 19 | memory: "512m" 20 | labels: 21 | version: 3.5.3 22 | serviceAccount: spark-operator-spark 23 | executor: 24 | cores: 1 25 | instances: 1 26 | memory: "512m" 27 | labels: 28 | version: 3.5.3 --------------------------------------------------------------------------------
/infra/eks/vpc.tf: -------------------------------------------------------------------------------- 1 | module "vpc" { 2 | source = "terraform-aws-modules/vpc/aws" 3 | version = "5.13.0" 4 | 5 | name = "vpc-${local.name}" 6 | cidr = local.vpc_cidr 7 | 8 | azs = local.azs 9 | private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 4, k)] 10 | public_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 48)] 11 | intra_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 52)] 12 | 13 | enable_nat_gateway = true 14 | single_nat_gateway = true 15 | 16 | public_subnet_tags = { 17 | "kubernetes.io/role/elb" = 1 18 | } 19 | 20 | private_subnet_tags = { 21 | "kubernetes.io/role/internal-elb" = 1 22 | # Tags subnets for Karpenter auto-discovery 23 | "karpenter.sh/discovery" = local.name 24 | } 25 | 26 | tags = local.tags 27 | 28 | } --------------------------------------------------------------------------------
/infra/prometheus/pod-monitor.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PodMonitor 3 | metadata: 4 | name: podmonitor-spark-job 5 | namespace: default 6 | spec: 7 | namespaceSelector: 8 | matchNames: 9 | - spark-jobs 10 | selector: 11 | matchLabels: 12 | monitored-by: prometheus 13 | podMetricsEndpoints: 14 | - port:
"jmx-exporter" 15 | path: "/metrics" 16 | - port: "spark-ui" 17 | path: "/metrics/prometheus" 18 | # metricRelabelings: 19 | # - regex: (metrics_)(.*)(_driver)(.*) 20 | # replacement: bridge$3$4 21 | # sourceLabels: [ __name__ ] 22 | # targetLabel: __name__ 23 | - port: "spark-ui" 24 | path: "/metrics/executors/prometheus" 25 | # metricRelabelings: 26 | # - regex: (metrics)(.*) 27 | # replacement: bridge$2 28 | # sourceLabels: [ __name__ ] 29 | # targetLabel: __name__ 30 | -------------------------------------------------------------------------------- /infra/prometheus/prometheus_rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: prometheus 5 | --- 6 | apiVersion: rbac.authorization.k8s.io/v1 7 | kind: ClusterRole 8 | metadata: 9 | name: prometheus 10 | rules: 11 | - apiGroups: [""] 12 | resources: 13 | - nodes 14 | - nodes/metrics 15 | - services 16 | - endpoints 17 | - pods 18 | verbs: ["get", "list", "watch"] 19 | - apiGroups: [""] 20 | resources: 21 | - configmaps 22 | verbs: ["get"] 23 | - apiGroups: 24 | - networking.k8s.io 25 | resources: 26 | - ingresses 27 | verbs: ["get", "list", "watch"] 28 | - nonResourceURLs: ["/metrics"] 29 | verbs: ["get"] 30 | --- 31 | apiVersion: rbac.authorization.k8s.io/v1 32 | kind: ClusterRoleBinding 33 | metadata: 34 | name: prometheus 35 | roleRef: 36 | apiGroup: rbac.authorization.k8s.io 37 | kind: ClusterRole 38 | name: prometheus 39 | subjects: 40 | - kind: ServiceAccount 41 | name: prometheus 42 | namespace: default -------------------------------------------------------------------------------- /jobs/pi.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: spark-pi 5 | namespace: default 6 | spec: 7 | type: Python 8 | mode: cluster 9 | image: "apache/spark-py:latest" 10 | imagePullPolicy: Always 11 | mainApplicationFile: "local:///opt/spark/examples/src/main/python/pi.py" 12 | sparkVersion: "3.5.3" 13 | restartPolicy: 14 | type: Never 15 | volumes: 16 | - name: "test-volume" 17 | hostPath: 18 | path: "/tmp" 19 | type: Directory 20 | driver: 21 | cores: 1 22 | coreLimit: "1200m" 23 | memory: "512m" 24 | labels: 25 | version: 3.5.3 26 | serviceAccount: spark-operator-spark 27 | volumeMounts: 28 | - name: "test-volume" 29 | mountPath: "/tmp" 30 | executor: 31 | cores: 1 32 | instances: 1 33 | memory: "512m" 34 | labels: 35 | version: 3.5.3 36 | volumeMounts: 37 | - name: "test-volume" 38 | mountPath: "/tmp" -------------------------------------------------------------------------------- /jobs/taxi-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: sparkoperator.k8s.io/v1beta2 2 | kind: SparkApplication 3 | metadata: 4 | name: taxi-job 5 | namespace: spark-jobs 6 | spec: 7 | deps: 8 | packages: 9 | - org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.7.0 10 | type: Python 11 | mode: cluster 12 | image: "tiagotxm/spark:3.5.3-hadoop-aws3.3.4-sdk1.12.262" 13 | imagePullPolicy: Always 14 | mainApplicationFile: "s3a://yt-lakehouse/scripts/test-iceberg.py" 15 | sparkConf: 16 | spark.hadoop.fs.s3a.path.style.access: "True" 17 | spark.hadoop.fs.s3a.fast.upload: "True" 18 | spark.hadoop.fs.s3a.multipart.size: "104857600" 19 | fs.s3a.connection.maximum: "100" 20 | spark.hadoop.fs.s3.impl: "org.apache.hadoop.fs.s3a.S3AFileSystem" 21 | spark.hadoop.fs.s3a.impl: 
"org.apache.hadoop.fs.s3a.S3AFileSystem" 22 | spark.hadoop.fs.s3a.aws.credentials.provider: "com.amazonaws.auth.WebIdentityTokenCredentialsProvider" 23 | sparkVersion: 3.5.3 24 | restartPolicy: 25 | type: Never 26 | driver: 27 | cores: 1 28 | memory: 512m 29 | labels: 30 | version: 3.5.3 31 | serviceAccount: spark-operator-spark 32 | executor: 33 | cores: 1 34 | instances: 1 35 | memory: 512m 36 | labels: 37 | version: 3.5.3 38 | -------------------------------------------------------------------------------- /infra/eks/main.tf: -------------------------------------------------------------------------------- 1 | provider "aws" { 2 | region = local.region 3 | } 4 | 5 | provider "aws" { 6 | region = "us-east-1" 7 | alias = "virginia" 8 | } 9 | 10 | provider "helm" { 11 | kubernetes { 12 | host = module.eks.cluster_endpoint 13 | cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) 14 | 15 | exec { 16 | api_version = "client.authentication.k8s.io/v1beta1" 17 | command = "aws" 18 | # This requires the awscli to be installed locally where Terraform is executed 19 | args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] 20 | } 21 | } 22 | } 23 | 24 | provider "kubectl" { 25 | apply_retry_count = 5 26 | host = module.eks.cluster_endpoint 27 | cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) 28 | load_config_file = false 29 | 30 | exec { 31 | api_version = "client.authentication.k8s.io/v1beta1" 32 | command = "aws" 33 | # This requires the awscli to be installed locally where Terraform is executed 34 | args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] 35 | } 36 | } 37 | 38 | data "aws_availability_zones" "available" {} 39 | data "aws_ecrpublic_authorization_token" "token" { 40 | provider = aws.virginia 41 | } -------------------------------------------------------------------------------- /infra/eks/eks.tf: -------------------------------------------------------------------------------- 1 | module "eks" { 2 | source = "terraform-aws-modules/eks/aws" 3 | version = "20.25.0" 4 | 5 | cluster_name = local.name 6 | cluster_version = "1.30" 7 | 8 | cluster_endpoint_public_access = true 9 | enable_cluster_creator_admin_permissions = true 10 | 11 | cluster_addons = { 12 | coredns = {} 13 | eks-pod-identity-agent = {} 14 | kube-proxy = {} 15 | vpc-cni = {} 16 | } 17 | 18 | vpc_id = module.vpc.vpc_id 19 | subnet_ids = module.vpc.private_subnets 20 | control_plane_subnet_ids = module.vpc.intra_subnets 21 | 22 | eks_managed_node_groups = { 23 | karpenter = { 24 | ami_type = "AL2023_x86_64_STANDARD" 25 | instance_types = ["m5.large"] 26 | 27 | min_size = 2 28 | max_size = 5 29 | desired_size = 2 30 | 31 | taints = { 32 | # This Taint aims to keep just EKS Addons and Karpenter running on this MNG 33 | # The pods that do not tolerate this taint should run on nodes created by Karpenter 34 | addons = { 35 | key = "CriticalAddonsOnly" 36 | value = "true" 37 | effect = "NO_SCHEDULE" 38 | }, 39 | } 40 | } 41 | } 42 | 43 | node_security_group_tags = merge(local.tags, { 44 | # NOTE - if creating multiple security groups with this module, only tag the 45 | # security group that Karpenter should utilize with the following tag 46 | # (i.e. 
- at most, only one security group should have this tag in your account) 47 | "karpenter.sh/discovery" = local.name 48 | }) 49 | 50 | tags = local.tags 51 | } 52 | -------------------------------------------------------------------------------- /jobs/taxi-job-with-node-affinity.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: sparkoperator.k8s.io/v1beta2 2 | kind: SparkApplication 3 | metadata: 4 | name: taxi-with-node-affinity-2 5 | namespace: spark-jobs 6 | spec: 7 | deps: 8 | packages: 9 | - org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.7.0 10 | type: Python 11 | mode: cluster 12 | image: "tiagotxm/spark:3.5.3-hadoop-aws3.3.4-sdk1.12.262" 13 | imagePullPolicy: Always 14 | mainApplicationFile: "s3a://yt-lakehouse/scripts/test-iceberg.py" 15 | sparkConf: 16 | spark.hadoop.fs.s3a.path.style.access: "True" 17 | spark.hadoop.fs.s3a.fast.upload: "True" 18 | spark.hadoop.fs.s3a.multipart.size: "104857600" 19 | fs.s3a.connection.maximum: "100" 20 | spark.hadoop.fs.s3.impl: "org.apache.hadoop.fs.s3a.S3AFileSystem" 21 | spark.hadoop.fs.s3a.impl: "org.apache.hadoop.fs.s3a.S3AFileSystem" 22 | spark.hadoop.fs.s3a.aws.credentials.provider: "com.amazonaws.auth.WebIdentityTokenCredentialsProvider" 23 | sparkVersion: 3.5.3 24 | restartPolicy: 25 | type: Never 26 | driver: 27 | cores: 1 28 | memory: 16g 29 | labels: 30 | version: 3.5.3 31 | job_name: taxi-with-node-affinity 32 | serviceAccount: spark-operator-spark 33 | affinity: 34 | nodeAffinity: 35 | requiredDuringSchedulingIgnoredDuringExecution: 36 | nodeSelectorTerms: 37 | - matchExpressions: 38 | - key: 'karpenter.sh/capacity-type' 39 | operator: In 40 | values: 41 | - on-demand 42 | 43 | executor: 44 | cores: 1 45 | instances: 10 46 | memory: 16gb 47 | labels: 48 | version: 3.5.3 49 | affinity: 50 | nodeAffinity: 51 | requiredDuringSchedulingIgnoredDuringExecution: 52 | nodeSelectorTerms: 53 | - matchExpressions: 54 | - key: 'karpenter.sh/capacity-type' 55 | operator: In 56 | values: 57 | - spot 58 | -------------------------------------------------------------------------------- /jobs/test-iceberg.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.types import DoubleType, FloatType, LongType, StructType, StructField, StringType 3 | import os 4 | 5 | def create_spark_session(): 6 | """Create and configure the Spark session.""" 7 | warehouse_path = os.getenv("WAREHOUSE_PATH", "s3a://yt-lakehouse/warehouse") 8 | 9 | spark = ( 10 | SparkSession.builder 11 | .appName("IcebergSparkSession") 12 | .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") 13 | .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") 14 | .config("spark.sql.catalog.spark_catalog.type", "hive") 15 | .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") 16 | .config("spark.sql.catalog.local.type", "hadoop") 17 | .config("spark.sql.catalog.local.warehouse", warehouse_path) 18 | .config("spark.sql.defaultCatalog", "local") 19 | .getOrCreate() 20 | ) 21 | return spark 22 | 23 | def main(): 24 | spark = create_spark_session() 25 | 26 | schema = StructType([ 27 | StructField("vendor_id", LongType(), True), 28 | StructField("trip_id", LongType(), True), 29 | StructField("trip_distance", FloatType(), True), 30 | StructField("fare_amount", DoubleType(), True), 31 | StructField("store_and_fwd_flag", StringType(), True) 32 | ]) 33 | 
34 | df = spark.createDataFrame([], schema) 35 | df.write.format("iceberg").mode("overwrite").save("demo.nyc.taxis") 36 | 37 | data = [ 38 | (1, 1000371, 1.8, 15.32, "N"), 39 | (2, 1000372, 2.5, 22.15, "N"), 40 | (2, 1000373, 0.9, 9.01, "N"), 41 | (1, 1000374, 8.4, 42.13, "Y") 42 | ] 43 | 44 | schema = spark.table("demo.nyc.taxis").schema 45 | df = spark.createDataFrame(data, schema) 46 | df.writeTo("demo.nyc.taxis").append() 47 | 48 | spark.table("demo.nyc.taxis").show() 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /jobs/taxi-job-with-pod-affinity.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: sparkoperator.k8s.io/v1beta2 2 | kind: SparkApplication 3 | metadata: 4 | name: taxi-with-node-affinity2 5 | namespace: spark-jobs 6 | spec: 7 | deps: 8 | packages: 9 | - org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.7.0 10 | type: Python 11 | mode: cluster 12 | image: "tiagotxm/spark:3.5.3-hadoop-aws3.3.4-sdk1.12.262" 13 | imagePullPolicy: Always 14 | mainApplicationFile: "s3a://yt-lakehouse/scripts/test-iceberg.py" 15 | sparkConf: 16 | spark.hadoop.fs.s3a.path.style.access: "True" 17 | spark.hadoop.fs.s3a.fast.upload: "True" 18 | spark.hadoop.fs.s3a.multipart.size: "104857600" 19 | fs.s3a.connection.maximum: "100" 20 | spark.hadoop.fs.s3.impl: "org.apache.hadoop.fs.s3a.S3AFileSystem" 21 | spark.hadoop.fs.s3a.impl: "org.apache.hadoop.fs.s3a.S3AFileSystem" 22 | spark.hadoop.fs.s3a.aws.credentials.provider: "com.amazonaws.auth.WebIdentityTokenCredentialsProvider" 23 | sparkVersion: 3.5.3 24 | restartPolicy: 25 | type: Never 26 | driver: 27 | cores: 1 28 | memory: 16g 29 | labels: 30 | version: 3.5.3 31 | job_name: taxi-with-node-affinity 32 | serviceAccount: spark-operator-spark 33 | affinity: 34 | nodeAffinity: 35 | requiredDuringSchedulingIgnoredDuringExecution: 36 | nodeSelectorTerms: 37 | - matchExpressions: 38 | - key: 'karpenter.sh/capacity-type' 39 | operator: In 40 | values: 41 | - on-demand 42 | podAffinity: 43 | requiredDuringSchedulingIgnoredDuringExecution: 44 | - labelSelector: 45 | matchExpressions: 46 | - key: job_name 47 | operator: In 48 | values: 49 | - taxi-with-node-affinity 50 | topologyKey: topology.kubernetes.io/zone 51 | executor: 52 | cores: 1 53 | instances: 1 54 | memory: 16g 55 | labels: 56 | version: 3.5.3 57 | affinity: 58 | nodeAffinity: 59 | requiredDuringSchedulingIgnoredDuringExecution: 60 | nodeSelectorTerms: 61 | - matchExpressions: 62 | - key: 'karpenter.sh/capacity-type' 63 | operator: In 64 | values: 65 | - spot 66 | podAffinity: 67 | requiredDuringSchedulingIgnoredDuringExecution: 68 | - labelSelector: 69 | matchExpressions: 70 | - key: job_name 71 | operator: In 72 | values: 73 | - taxi-with-node-affinity 74 | topologyKey: topology.kubernetes.io/zone -------------------------------------------------------------------------------- /Dockerfile.base: -------------------------------------------------------------------------------- 1 | 2 | ARG JRE_VERSION=11-jre-slim-buster 3 | 4 | FROM openjdk:${JRE_VERSION} AS base 5 | 6 | ARG SPARK_VERSION_DEFAULT=3.5.3 7 | ARG HADOOP_VERSION_DEFAULT=3 8 | ARG HADOOP_AWS_VERSION_DEFAULT=3.3.4 9 | ARG AWS_SDK_BUNDLE_VERSION_DEFAULT=1.12.262 10 | 11 | # Define ENV variables 12 | ENV SPARK_VERSION=${SPARK_VERSION_DEFAULT} 13 | ENV HADOOP_VERSION=${HADOOP_VERSION_DEFAULT} 14 | ENV HADOOP_AWS_VERSION=${HADOOP_AWS_VERSION_DEFAULT} 15 | ENV 
AWS_SDK_BUNDLE_VERSION=${AWS_SDK_BUNDLE_VERSION_DEFAULT} 16 | 17 | RUN apt-get update \ 18 | && apt-get install -y curl bash tini libc6 libpam-modules krb5-user libnss3 procps \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | FROM base AS spark-base 22 | 23 | # Download and extract Spark 24 | RUN curl -L https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -o spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \ 25 | && tar -xvzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \ 26 | && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} /opt/spark \ 27 | && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz 28 | 29 | COPY entrypoint.sh /opt/spark 30 | 31 | RUN chmod a+x /opt/spark/entrypoint.sh 32 | 33 | FROM spark-base AS sparkbuilder 34 | 35 | # Set SPARK_HOME 36 | ENV SPARK_HOME=/opt/spark 37 | 38 | # Extend PATH environment variable 39 | ENV PATH=${PATH}:${SPARK_HOME}/bin 40 | 41 | # Create the application directory 42 | RUN mkdir -p /app 43 | 44 | FROM sparkbuilder AS spark-with-s3 45 | 46 | # Download S3 and GCS jars 47 | RUN curl -L https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_AWS_VERSION}/hadoop-aws-${HADOOP_AWS_VERSION}.jar -o ${SPARK_HOME}/jars/hadoop-aws-${HADOOP_AWS_VERSION}.jar \ 48 | && curl -L https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE_VERSION}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE_VERSION}.jar -o ${SPARK_HOME}/jars/aws-java-sdk-bundle-${AWS_SDK_BUNDLE_VERSION}.jar \ 49 | && rm -rf /var/lib/apt/lists/* 50 | 51 | FROM spark-with-s3 AS spark-with-python 52 | 53 | ENV PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH 54 | 55 | RUN apt-get update -y \ 56 | && apt-get install -y --no-install-recommends python3 python3-pip \ 57 | && pip3 install --upgrade pip setuptools \ 58 | # Removed the .cache to save space 59 | && rm -r /root/.cache \ 60 | && rm -rf /var/cache/apt/* \ 61 | && rm -rf /var/lib/apt/lists/* 62 | 63 | WORKDIR /app 64 | 65 | # ADD requirements.txt . 66 | 67 | # Add application files 68 | # ADD . . 69 | 70 | # Install application specific python dependencies 71 | # RUN pip3 install -r requirements.txt 72 | 73 | USER root 74 | 75 | ENTRYPOINT [ "/opt/spark/entrypoint.sh" ] 76 | -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | # echo commands to the terminal output 20 | set -ex 21 | 22 | # Check whether there is a passwd entry for the container UID 23 | myuid=$(id -u) 24 | mygid=$(id -g) 25 | # turn off -e for getent because it will return error code in anonymous uid case 26 | set +e 27 | uidentry=$(getent passwd $myuid) 28 | set -e 29 | 30 | # If there is no passwd entry for the container UID, attempt to create one 31 | if [ -z "$uidentry" ] ; then 32 | if [ -w /etc/passwd ] ; then 33 | echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd 34 | else 35 | echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" 36 | fi 37 | fi 38 | 39 | SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" 40 | env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt 41 | readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt 42 | 43 | if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then 44 | SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" 45 | fi 46 | 47 | if ! [ -z ${PYSPARK_PYTHON+x} ]; then 48 | export PYSPARK_PYTHON 49 | fi 50 | if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then 51 | export PYSPARK_DRIVER_PYTHON 52 | fi 53 | 54 | # If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. 55 | # It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. 56 | if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then 57 | export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" 58 | fi 59 | 60 | if ! [ -z ${HADOOP_CONF_DIR+x} ]; then 61 | SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; 62 | fi 63 | 64 | if ! [ -z ${SPARK_CONF_DIR+x} ]; then 65 | SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; 66 | elif ! [ -z ${SPARK_HOME+x} ]; then 67 | SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; 68 | fi 69 | 70 | case "$1" in 71 | driver) 72 | shift 1 73 | CMD=( 74 | "$SPARK_HOME/bin/spark-submit" 75 | --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" 76 | --deploy-mode client 77 | "$@" 78 | ) 79 | ;; 80 | executor) 81 | shift 1 82 | CMD=( 83 | ${JAVA_HOME}/bin/java 84 | "${SPARK_EXECUTOR_JAVA_OPTS[@]}" 85 | -Xms$SPARK_EXECUTOR_MEMORY 86 | -Xmx$SPARK_EXECUTOR_MEMORY 87 | -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" 88 | org.apache.spark.executor.CoarseGrainedExecutorBackend 89 | --driver-url $SPARK_DRIVER_URL 90 | --executor-id $SPARK_EXECUTOR_ID 91 | --cores $SPARK_EXECUTOR_CORES 92 | --app-id $SPARK_APPLICATION_ID 93 | --hostname $SPARK_EXECUTOR_POD_IP 94 | --resourceProfileId $SPARK_RESOURCE_PROFILE_ID 95 | ) 96 | ;; 97 | 98 | *) 99 | echo "Non-spark-on-k8s command provided, proceeding in pass-through mode..." 
100 | CMD=("$@") 101 | ;; 102 | esac 103 | 104 | # Execute the container CMD under tini for better hygiene 105 | exec /usr/bin/tini -s -- "${CMD[@]}" -------------------------------------------------------------------------------- /infra/eks/karpenter.tf: -------------------------------------------------------------------------------- 1 | module "karpenter" { 2 | source = "terraform-aws-modules/eks/aws//modules/karpenter" 3 | 4 | cluster_name = module.eks.cluster_name 5 | enable_v1_permissions = true 6 | enable_pod_identity = true 7 | create_pod_identity_association = true 8 | 9 | # Used to attach additional IAM policies to the Karpenter node IAM role 10 | node_iam_role_additional_policies = { 11 | AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" 12 | } 13 | 14 | tags = local.tags 15 | } 16 | 17 | 18 | resource "helm_release" "karpenter" { 19 | namespace = "kube-system" 20 | name = "karpenter" 21 | repository = "oci://public.ecr.aws/karpenter" 22 | repository_username = data.aws_ecrpublic_authorization_token.token.user_name 23 | repository_password = data.aws_ecrpublic_authorization_token.token.password 24 | chart = "karpenter" 25 | version = "1.0.0" 26 | wait = false 27 | 28 | values = [ 29 | <<-EOT 30 | serviceAccount: 31 | name: ${module.karpenter.service_account} 32 | settings: 33 | clusterName: ${module.eks.cluster_name} 34 | clusterEndpoint: ${module.eks.cluster_endpoint} 35 | interruptionQueue: ${module.karpenter.queue_name} 36 | EOT 37 | ] 38 | } 39 | 40 | resource "kubectl_manifest" "karpenter_node_class" { 41 | yaml_body = <<-YAML 42 | apiVersion: karpenter.k8s.aws/v1 43 | kind: EC2NodeClass 44 | metadata: 45 | name: default 46 | spec: 47 | amiFamily: AL2023 48 | role: ${module.karpenter.node_iam_role_name} 49 | subnetSelectorTerms: 50 | - tags: 51 | karpenter.sh/discovery: ${module.eks.cluster_name} 52 | securityGroupSelectorTerms: 53 | - tags: 54 | karpenter.sh/discovery: ${module.eks.cluster_name} 55 | tags: 56 | karpenter.sh/discovery: ${module.eks.cluster_name} 57 | amiSelectorTerms: 58 | - alias: al2023@latest 59 | YAML 60 | 61 | depends_on = [ 62 | helm_release.karpenter 63 | ] 64 | } 65 | 66 | resource "kubectl_manifest" "karpenter_node_pool" { 67 | yaml_body = <<-YAML 68 | apiVersion: karpenter.sh/v1 69 | kind: NodePool 70 | metadata: 71 | name: default 72 | spec: 73 | template: 74 | spec: 75 | nodeClassRef: 76 | name: default 77 | kind: EC2NodeClass 78 | group: karpenter.k8s.aws 79 | requirements: 80 | - key: "karpenter.k8s.aws/instance-category" 81 | operator: In 82 | values: ["c", "m", "r"] 83 | - key: "karpenter.k8s.aws/instance-cpu" 84 | operator: In 85 | values: ["4", "8", "16", "32"] 86 | - key: "karpenter.k8s.aws/instance-hypervisor" 87 | operator: In 88 | values: ["nitro"] 89 | - key: "karpenter.k8s.aws/instance-generation" 90 | operator: Gt 91 | values: ["2"] 92 | limits: 93 | cpu: 1000 94 | disruption: 95 | consolidationPolicy: WhenEmpty 96 | consolidateAfter: 30s 97 | YAML 98 | 99 | depends_on = [ 100 | kubectl_manifest.karpenter_node_class 101 | ] 102 | } 103 | 104 | resource "kubectl_manifest" "karpenter_spark_jobs_pool" { 105 | yaml_body = <<-YAML 106 | apiVersion: karpenter.sh/v1 107 | kind: NodePool 108 | metadata: 109 | name: spark-jobs-pool 110 | spec: 111 | template: 112 | spec: 113 | nodeClassRef: 114 | name: default 115 | kind: EC2NodeClass 116 | group: karpenter.k8s.aws 117 | requirements: 118 | - key: "karpenter.k8s.aws/instance-category" 119 | operator: In 120 | values: ["r"] 121 | - key: 
"karpenter.k8s.aws/instance-hypervisor" 122 | operator: In 123 | values: ["nitro"] 124 | - key: "karpenter.k8s.aws/instance-generation" 125 | operator: Gt 126 | values: ["2"] 127 | - key: "kubernetes.io/arch" 128 | operator: In 129 | values: ["arm64"] #["arm64", "amd64"] 130 | - key: "karpenter.sh/capacity-type" 131 | operator: In 132 | values: ["spot", "on-demand"] 133 | 134 | limits: 135 | cpu: "1000" 136 | memory: 1000Gi 137 | 138 | YAML 139 | } -------------------------------------------------------------------------------- /jobs/config/spark-metrics-cfg-map.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: spark-metrics-config 5 | namespace: spark-jobs 6 | data: 7 | metrics.properties: |- 8 | *.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet 9 | *.sink.prometheusServlet.path=/metrics/prometheus 10 | master.sink.prometheusServlet.path=/metrics/master/prometheus 11 | applications.sink.prometheusServlet.path=/metrics/applications/prometheus 12 | driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource 13 | executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource 14 | prometheus.yaml: |- 15 | lowercaseOutputName: true 16 | attrNameSnakeCase: true 17 | rules: 18 | # These come from the application driver if it's a streaming application 19 | # Example: default/streaming.driver.com.example.ClassName.StreamingMetrics.streaming.lastCompletedBatch_schedulingDelay 20 | - pattern: metrics<>Value 21 | name: spark_streaming_driver_$4 22 | labels: 23 | app_namespace: "$1" 24 | app_id: "$2" 25 | # These come from the application driver if it's a structured streaming application 26 | # Example: default/streaming.driver.spark.streaming.QueryName.inputRate-total 27 | - pattern: metrics<>Value 28 | name: spark_structured_streaming_driver_$4 29 | labels: 30 | app_namespace: "$1" 31 | app_id: "$2" 32 | query_name: "$3" 33 | # These come from the application executors 34 | # Example: default/spark-pi.0.executor.threadpool.activeTasks 35 | - pattern: metrics<>Value 36 | name: spark_executor_$4 37 | type: GAUGE 38 | labels: 39 | app_namespace: "$1" 40 | app_id: "$2" 41 | executor_id: "$3" 42 | # These come from the application driver 43 | # Example: default/spark-pi.driver.DAGScheduler.stage.failedStages 44 | - pattern: metrics<>Value 45 | name: spark_driver_$3_$4 46 | type: GAUGE 47 | labels: 48 | app_namespace: "$1" 49 | app_id: "$2" 50 | # These come from the application driver 51 | # Emulate timers for DAGScheduler like messagePRocessingTime 52 | - pattern: metrics<>Count 53 | name: spark_driver_DAGScheduler_$3_count 54 | type: COUNTER 55 | labels: 56 | app_namespace: "$1" 57 | app_id: "$2" 58 | # HiveExternalCatalog is of type counter 59 | - pattern: metrics<>Count 60 | name: spark_driver_HiveExternalCatalog_$3_count 61 | type: COUNTER 62 | labels: 63 | app_namespace: "$1" 64 | app_id: "$2" 65 | # These come from the application driver 66 | # Emulate histograms for CodeGenerator 67 | - pattern: metrics<>Count 68 | name: spark_driver_CodeGenerator_$3_count 69 | type: COUNTER 70 | labels: 71 | app_namespace: "$1" 72 | app_id: "$2" 73 | # These come from the application driver 74 | # Emulate timer (keep only count attribute) plus counters for LiveListenerBus 75 | - pattern: metrics<>Count 76 | name: spark_driver_LiveListenerBus_$3_count 77 | type: COUNTER 78 | labels: 79 | app_namespace: "$1" 80 | app_id: "$2" 81 | # Get Gauge type metrics for LiveListenerBus 
82 | - pattern: metrics<name=(\S+)\.(\S+)\.driver\.LiveListenerBus\.(.*)><>Value 83 | name: spark_driver_LiveListenerBus_$3 84 | type: GAUGE 85 | labels: 86 | app_namespace: "$1" 87 | app_id: "$2" 88 | # Executors counters 89 | - pattern: metrics<name=(\S+)\.(\S+)\.(.*)\.executor\.(.*)><>Count 90 | name: spark_executor_$4_count 91 | type: COUNTER 92 | labels: 93 | app_namespace: "$1" 94 | app_id: "$2" 95 | executor_id: "$3" 96 | # These come from the application executors 97 | # Example: app-20160809000059-0000.0.jvm.threadpool.activeTasks 98 | - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.(jvm|NettyBlockTransfer)\.(.*)><>Value 99 | name: spark_executor_$4_$5 100 | type: GAUGE 101 | labels: 102 | app_namespace: "$1" 103 | app_id: "$2" 104 | executor_id: "$3" 105 | - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.HiveExternalCatalog\.(.*)><>Count 106 | name: spark_executor_HiveExternalCatalog_$4_count 107 | type: COUNTER 108 | labels: 109 | app_namespace: "$1" 110 | app_id: "$2" 111 | executor_id: "$3" 112 | # These come from the application driver 113 | # Emulate histograms for CodeGenerator 114 | - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.CodeGenerator\.(.*)><>Count 115 | name: spark_executor_CodeGenerator_$4_count 116 | type: COUNTER 117 | labels: 118 | app_namespace: "$1" 119 | app_id: "$2" 120 | executor_id: "$3" --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # 🚀 Kubeflow Spark Operator - Complete Tutorial 2 | 3 | This repository contains an up-to-date tutorial on the **Kubeflow Spark Operator**, covering everything from fundamental concepts to a complete *hands-on*. 4 | 5 | ## 🎯 Motivation 6 | 7 | The **Spark Operator** was recently migrated to the **Kubeflow** community, and the project has gone through some changes. This tutorial aims to provide up-to-date content on this new phase of the operator, so you can get the most out of its integration with Kubernetes. 8 | 9 | ## 📌 What will be covered? 10 | 11 | This tutorial is divided into the following topics: 12 | 13 | 1. **Review of the Spark architecture** 14 | 2. **Introduction to the Kubeflow Spark Operator** 15 | 3. **Hands-on with a local environment** 16 | 4. **Building a custom Spark image with Iceberg** 17 | 5. **Deploying EKS with autoscaling on AWS (Terraform + Karpenter)** 18 | 6. **Installing and using ArgoCD to deploy the Spark Operator on EKS** 19 | 7. **Running a PySpark job with Iceberg integrated with S3** 20 | 8. **Cost-reduction best practices** 21 | - *Spot* and *on-demand* instances 22 | - *Node affinity* and *pod affinity* 23 | 24 | --- 25 | 26 | ## 📌 Apache Spark Architecture 27 | 28 | Let's briefly recap the Apache Spark architecture at a high level. 29 | 30 | ![Apache Spark architecture]() 31 | 32 | Spark has three main components: 33 | 34 | - **Driver**: Manages the execution of the Spark application: it communicates with the *Cluster Manager* to request CPU and memory, and it distributes, monitors, and schedules tasks on the *Executors*. 35 | - **Executors**: Processes that run the Spark application's tasks. They are responsible for executing the user code, storing data in memory or on disk, and returning results to the *Driver*. They are allocated on the *Workers*. 36 | - **Cluster Manager**: Coordinates the allocation of resources such as CPU and memory across the cluster. Spark currently supports *Standalone*, *Mesos*, *YARN*, and *Kubernetes*. 37 | 38 | --- 39 | 40 | ## 🚀 Advantages of Spark on Kubernetes 41 | 42 | **Kubernetes** is a widely used container orchestrator, and its adoption in the Big Data ecosystem keeps growing thanks to its advantages: 43 | 44 | 1.
**Scalability and automated management** 45 | - Kubernetes works declaratively, ensuring that the required resources are provisioned automatically. 46 | 47 | 2. **Unified environment** 48 | - Allows software and data teams to share the same infrastructure. 49 | 50 | 3. **Advanced customization** 51 | - Support for different machine types, *node pools*, *affinity rules*, and other settings. 52 | 53 | 4. **Cloud-provider agnostic** 54 | - Works both on managed services (*EKS*, *GKE*, *AKS*) and on *on-premise* clusters. 55 | 56 | 5. **Integration with the Kubernetes ecosystem** 57 | - Observability, monitoring, *service mesh*, CI/CD, and more. 58 | 59 | --- 60 | 61 | ## 📌 Introduction to the Kubeflow Spark Operator 62 | 63 | Now that we have reviewed Spark's architecture, let's look at how the **Kubeflow Spark Operator** works. 64 | 65 | 📌 **Important**: From the point of view of the Spark code (PySpark, Spark SQL, Streaming, etc.), nothing changes. The difference is in how we declare and submit the *jobs* on Kubernetes. 66 | 67 | Kubernetes works with declarative YAML files, and the **Spark Operator** follows the same approach. To run a Spark job, we need a YAML manifest that describes its execution. 68 | 69 | ```yaml 70 | apiVersion: "sparkoperator.k8s.io/v1beta2" 71 | kind: SparkApplication 72 | metadata: 73 | name: spark-pi 74 | namespace: default 75 | spec: 76 | type: Python 77 | mode: cluster 78 | image: "tiagotxm/spark:3.5.3-demo-yt" 79 | imagePullPolicy: Always 80 | mainApplicationFile: "local:///opt/spark/examples/src/main/python/pi.py" 81 | sparkVersion: "3.5.3" 82 | restartPolicy: 83 | type: Never 84 | driver: 85 | cores: 1 86 | coreLimit: "1200m" 87 | memory: "512m" 88 | labels: 89 | version: 3.5.3 90 | serviceAccount: spark-operator-spark 91 | executor: 92 | cores: 1 93 | instances: 1 94 | memory: "512m" 95 | labels: 96 | version: 3.5.3 97 | 98 | ``` 99 | 100 | ## 🛠 Hands-on: Local Environment 101 | 102 | ### 🔹 Step 1: Install Kind 103 | - https://kind.sigs.k8s.io/ 104 | 105 | 106 | ### 🔹 Step 2: Create a Kubernetes Cluster 107 | ``` 108 | kind create cluster --config infra/local/kind.yaml 109 | ``` 110 | 111 | ### 🔹 Step 3: Install the Spark Operator 112 | ``` 113 | helm repo add spark-operator https://kubeflow.github.io/spark-operator 114 | helm repo update 115 | 116 | # Install the operator into the spark-operator namespace and wait for deployments to be ready 117 | helm install spark-operator spark-operator/spark-operator \ 118 | --namespace spark-operator --create-namespace --wait 119 | ``` 120 | 121 | ### 🔹 Step 4: Create a Spark Job 122 | ```yaml 123 | apiVersion: "sparkoperator.k8s.io/v1beta2" 124 | kind: SparkApplication 125 | metadata: 126 | name: spark-pi 127 | namespace: default 128 | spec: 129 | type: Python 130 | mode: cluster 131 | image: "tiagotxm/spark:3.5.3-demo-yt" 132 | imagePullPolicy: Always 133 | mainApplicationFile: "local:///opt/spark/examples/src/main/python/pi.py" 134 | sparkVersion: "3.5.3" 135 | restartPolicy: 136 | type: Never 137 | volumes: 138 | - name: "test-volume" 139 | hostPath: 140 | path: "/tmp" 141 | type: Directory 142 | driver: 143 | cores: 1 144 | coreLimit: "1200m" 145 | memory: "512m" 146 | labels: 147 | version: 3.5.3 148 | serviceAccount: spark-operator-spark 149 | volumeMounts: 150 | - name: "test-volume" 151 | mountPath: "/tmp" 152 | executor: 153 | cores: 1 154 | instances: 1 155 | memory: "512m" 156 | labels: 157 | version: 3.5.3 158 |
volumeMounts: 159 | - name: "test-volume" 160 | mountPath: "/tmp" 161 | 162 | ``` 163 | 164 | * Apply the YAML to the cluster: 165 | ``` 166 | kubectl apply -f spark-pi.yaml 167 | ``` 168 | * To check the job status: 169 | ``` 170 | kubectl get sparkapplications 171 | ``` 172 | 173 | * To view the driver logs: 174 | ``` 175 | kubectl logs -f <driver-pod-name> 176 | ``` 177 | 178 | * To delete a job: 179 | ``` 180 | kubectl delete sparkapplication <application-name> 181 | ``` 182 | 183 | ## 🛠 Building a Custom Spark Image 184 | 185 | ### 🔹 Creating a Dockerfile 186 | 187 | - To build a Spark base image, use this 188 | [Dockerfile](Dockerfile.base) 189 | 190 | - To add the Iceberg dependencies, use this [Dockerfile.iceberg](Dockerfile) 191 | 192 | 193 | 194 | ### 🔹 Build and Push the Image to Docker Hub 195 | The build is based on your processor architecture. 196 | In my case I am using an ARM64 processor, so the image will be built for that architecture. 197 | ``` 198 | docker build -t <your-dockerhub-username>/spark-iceberg:latest . 199 | docker push <your-dockerhub-username>/spark-iceberg:latest 200 | ``` 201 | 202 | ### Tip: 203 | Build the Spark image for multiple architectures so it can be used on different processors. 204 | ``` 205 | docker buildx build --platform linux/amd64,linux/arm64 -t <your-dockerhub-username>/spark-iceberg:latest --push . 206 | ``` 207 | 208 | --- 209 | 210 | ## 📌 Kubeflow Spark Operator in Production 211 | 212 | In this section we will run a PySpark job with Iceberg integrated with S3 in a production environment. 213 | For that, we will use AWS EKS (Elastic Kubernetes Service) configured with autoscaling (Karpenter) and ArgoCD to deploy the Spark Operator. 214 | 215 | ## Requirements 216 | - AWS CLI installed and configured 217 | - K9s 218 | 219 | ### 🔹 Deploy EKS with Karpenter (run inside the `infra/eks` directory) 220 | ```` 221 | $ terraform init 222 | $ terraform apply -auto-approve 223 | ```` 224 | 225 | ### 🔹 Authenticate locally with the EKS cluster 226 | ``` 227 | $ aws eks update-kubeconfig --name eks-data --region us-east-1 228 | ``` 229 | 230 | ### 🔹 Start a k9s session 231 | ``` 232 | $ k9s 233 | ``` 234 | 235 | 236 | ## ArgoCD 237 | Create the namespace for Argo 238 | ```` 239 | $ kubectl create namespace argocd 240 | ```` 241 | 242 | Install ArgoCD 243 | ```` 244 | $ kubectl apply -n argocd -f https://raw.githubusercontent.com/argoproj/argo-cd/stable/manifests/install.yaml 245 | 246 | ```` 247 | Retrieve the admin user's password to log in to the UI 248 | ```` 249 | kubectl -n argocd get secret argocd-initial-admin-secret -o jsonpath="{.data.password}" | base64 -d 250 | ```` 251 | 252 | Create the SSH key used to authenticate the repository with Argo 253 | ```` 254 | $ ssh-keygen -t rsa -b 4096 -C "<your-email>" 255 | ```` 256 | 257 | 258 | ### Useful Links and References 259 | - K9s - https://k9scli.io 260 | - Karpenter - https://karpenter.sh/docs/ 261 | - IRSA - https://docs.aws.amazon.com/eks/latest/userguide/associate-service-account-role.html 262 | 263 | 264 | --------------------------------------------------------------------------------
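
The ArgoCD steps in the README above create an SSH key but stop short of wiring it into ArgoCD and deploying the Spark Operator Application. The snippet below is a minimal sketch of those remaining steps; it assumes the key pair lives at `~/.ssh/id_rsa`, that its public key was added as a deploy key on the Git repository referenced in `argo_apps/spark-operator/app.yaml`, and that the ArgoCD UI is exposed locally on port 8080. Adjust paths, ports, and the repository URL to your setup.

```
# Expose the ArgoCD API/UI locally (https://localhost:8080, user: admin)
kubectl port-forward svc/argocd-server -n argocd 8080:443

# Log in with the ArgoCD CLI using the initial admin password
argocd login localhost:8080 --username admin \
  --password "$(kubectl -n argocd get secret argocd-initial-admin-secret -o jsonpath='{.data.password}' | base64 -d)" \
  --insecure

# Register the private Git repository using the SSH key created earlier
argocd repo add git@github.com:tiagotxm/charts.git --ssh-private-key-path ~/.ssh/id_rsa

# Create the Application that deploys the Spark Operator through ArgoCD
kubectl apply -f argo_apps/spark-operator/app.yaml
```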