├── .gitignore ├── cr.yaml ├── ct.yaml ├── charts └── pyspark-notebook │ ├── templates │ ├── serviceaccount.yaml │ ├── rolebinding.yaml │ ├── role.yaml │ ├── NOTES.txt │ ├── service.yaml │ ├── ingress.yaml │ ├── _helpers.tpl │ └── statefulset.yaml │ ├── .helmignore │ ├── Chart.yaml │ ├── values.yaml │ └── README.md ├── .github └── workflows │ ├── release.yml │ └── lint-test.yml └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.iml 3 | 4 | node_modules/ 5 | lib/ 6 | 7 | logs/ 8 | *.log* 9 | -------------------------------------------------------------------------------- /cr.yaml: -------------------------------------------------------------------------------- 1 | # Set to true for GPG signing 2 | sign: false 3 | # UID of the GPG key to use 4 | key: Chart Releaser Test Key -------------------------------------------------------------------------------- /ct.yaml: -------------------------------------------------------------------------------- 1 | # See https://github.com/helm/chart-testing#configuration 2 | remote: origin 3 | target-branch: master 4 | chart-dirs: 5 | - charts 6 | chart-repos: 7 | - pyspark-notebook=https://a3data.github.io/pyspark-notebook-helm/ -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "pyspark.serviceAccountName" . }} 6 | labels: 7 | {{- include "pyspark.labels" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | {{- end }} 13 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/rolebinding.yaml: -------------------------------------------------------------------------------- 1 | {{- $fullName := include "pyspark.fullname" . -}} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: RoleBinding 4 | metadata: 5 | name: {{ $fullName | quote }} 6 | labels: 7 | {{- include "pyspark.labels" . | nindent 4 }} 8 | subjects: 9 | - kind: ServiceAccount 10 | name: {{ include "pyspark.serviceAccountName" . 
}} 11 | namespace: {{ .Release.Namespace | quote }} 12 | roleRef: 13 | kind: Role 14 | name: {{ $fullName | quote }} 15 | apiGroup: rbac.authorization.k8s.io 16 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release Charts 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | release: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v2 14 | with: 15 | fetch-depth: 0 16 | 17 | - name: Configure Git 18 | run: | 19 | git config user.name "$GITHUB_ACTOR" 20 | git config user.email "$GITHUB_ACTOR@users.noreply.github.com" 21 | 22 | - name: Run chart-releaser 23 | uses: helm/chart-releaser-action@v1.1.0 24 | env: 25 | CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" 26 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/role.yaml: -------------------------------------------------------------------------------- 1 | {{- $fullName := include "pyspark.fullname" . -}} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | name: {{ $fullName | quote }} 6 | labels: 7 | {{- include "pyspark.labels" . | nindent 4 }} 8 | rules: 9 | - apiGroups: 10 | - "" 11 | resources: 12 | - pods 13 | verbs: 14 | - create 15 | - get 16 | - delete 17 | - list 18 | - watch 19 | - apiGroups: 20 | - "" 21 | resources: 22 | - services 23 | verbs: 24 | - get 25 | - create 26 | - apiGroups: 27 | - "" 28 | resources: 29 | - pods/log 30 | verbs: 31 | - get 32 | - list 33 | - apiGroups: 34 | - "" 35 | resources: 36 | - pods/exec 37 | verbs: 38 | - create 39 | - get 40 | - apiGroups: 41 | - "" 42 | resources: 43 | - configmaps 44 | verbs: 45 | - get 46 | - create 47 | - list 48 | - watch 49 | - delete 50 | -------------------------------------------------------------------------------- /.github/workflows/lint-test.yml: -------------------------------------------------------------------------------- 1 | name: Lint and Test Charts 2 | 3 | on: pull_request 4 | 5 | jobs: 6 | lint-test: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout 10 | uses: actions/checkout@v2 11 | with: 12 | fetch-depth: 0 13 | 14 | - name: Set up Helm 15 | uses: azure/setup-helm@v1 16 | with: 17 | version: v3.4.1 18 | 19 | # Python is required because `ct lint` runs Yamale (https://github.com/23andMe/Yamale) and 20 | # yamllint (https://github.com/adrienverge/yamllint) which require Python 21 | - name: Set up Python 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: 3.7 25 | 26 | - name: Set up chart-testing 27 | uses: helm/chart-testing-action@v2.0.1 28 | with: 29 | version: v3.3.0 30 | 31 | - name: Run chart-testing (lint) 32 | run: ct lint --config ct.yaml 33 | 34 | - name: Create kind cluster 35 | uses: helm/kind-action@v1.2.0 36 | 37 | - name: Run chart-testing (install) 38 | run: ct install --config ct.yaml -------------------------------------------------------------------------------- /charts/pyspark-notebook/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: pyspark-notebook 3 | description: A Kubernetes Helm chart for deploying Pyspark Notebook 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 
9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.2.2 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | appVersion: "1.16.0" 25 | maintainers: 26 | - email: marcusvoliveira@dcc.ufmg.br 27 | name: voidmarcus 28 | - email: maylateixeira2013@gmail.com 29 | name: maylatt 30 | - email: neylsoncrepalde@gmail.com 31 | name: neylsoncrepalde 32 | 33 | sources: 34 | - https://github.com/a3data/pyspark-notebook-helm 35 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | 1. Get the application URL by running these commands: 2 | {{- if .Values.ingress.enabled }} 3 | {{- range $host := .Values.ingress.hosts }} 4 | {{- range .paths }} 5 | http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} 6 | {{- end }} 7 | {{- end }} 8 | {{- else if contains "NodePort" .Values.service.type }} 9 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "pyspark.fullname" . }}) 10 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") 11 | echo http://$NODE_IP:$NODE_PORT 12 | {{- else if contains "LoadBalancer" .Values.service.type }} 13 | NOTE: It may take a few minutes for the LoadBalancer IP to be available. 14 | You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "pyspark.fullname" . }}' 15 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "pyspark.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") 16 | echo http://$SERVICE_IP:{{ .Values.service.port }} 17 | {{- else if contains "ClusterIP" .Values.service.type }} 18 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "pyspark.name" . 
}},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") 19 | export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") 20 | echo "Visit http://127.0.0.1:8080 to use your application" 21 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT 22 | {{- end }} 23 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/service.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: {{ include "pyspark.fullname" . }} 6 | labels: 7 | {{- include "pyspark.labels" . | nindent 4 }} 8 | annotations: 9 | {{ toYaml .Values.service.annotations | indent 4 }} 10 | spec: 11 | type: {{ .Values.service.type }} 12 | selector: 13 | {{- include "pyspark.selectorLabels" . | nindent 4 }} 14 | ports: 15 | - name: {{ .Values.service.httpPortName | default "http" }} 16 | protocol: TCP 17 | port: {{ .Values.httpPort }} 18 | targetPort: http 19 | {{- if .Values.service.nodePort }} 20 | nodePort: {{ .Values.service.nodePort }} 21 | {{- end }} 22 | - name: {{ .Values.service.blockManagerPortName | default "blockmanager" }} 23 | protocol: TCP 24 | port: {{ .Values.blockManagerPort }} 25 | targetPort: blockmanager 26 | - name: {{ .Values.service.driverPortName | default "driver" }} 27 | protocol: TCP 28 | port: {{ .Values.driverPort }} 29 | targetPort: driver 30 | {{- if .Values.service.loadBalancerIP }} 31 | loadBalancerIP: {{ .Values.service.loadBalancerIP }} 32 | {{- end }} 33 | {{- with .Values.service.loadBalancerSourceRanges }} 34 | loadBalancerSourceRanges: 35 | {{ toYaml . | indent 4 }} 36 | {{- end }} 37 | {{- if .Values.service.externalTrafficPolicy }} 38 | externalTrafficPolicy: {{ .Values.service.externalTrafficPolicy }} 39 | {{- end }} 40 | 41 | --- 42 | apiVersion: v1 43 | kind: Service 44 | metadata: 45 | name: {{ template "pyspark.fullname" . }}-headless 46 | labels: 47 | {{- include "pyspark.labels" . | nindent 4 }} 48 | annotations: 49 | service.alpha.kubernetes.io/tolerate-unready-endpoints: "true" 50 | spec: 51 | clusterIP: None # This is needed for statefulset hostnames like pyspark-0 to resolve 52 | # Create endpoints also if the related pod isn't ready 53 | publishNotReadyAddresses: true 54 | selector: 55 | app: "{{ template "pyspark.fullname" . }}" 56 | ports: 57 | - name: {{ .Values.service.httpPortName | default "http" }} 58 | port: {{ .Values.httpPort }} 59 | - name: {{ .Values.service.blockManagerPortName | default "blockmanager" }} 60 | port: {{ .Values.blockManagerPort }} 61 | - name: {{ .Values.service.driverPortName | default "driver" }} 62 | port: {{ .Values.driverPort }} -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/ingress.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.ingress.enabled -}} 2 | {{- $fullName := include "pyspark.fullname" . 
-}} 3 | {{- $svcPort := .Values.service.port -}} 4 | {{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} 5 | {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} 6 | {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} 7 | {{- end }} 8 | {{- end }} 9 | {{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} 10 | apiVersion: networking.k8s.io/v1 11 | {{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} 12 | apiVersion: networking.k8s.io/v1beta1 13 | {{- else -}} 14 | apiVersion: extensions/v1beta1 15 | {{- end }} 16 | kind: Ingress 17 | metadata: 18 | name: {{ $fullName }} 19 | labels: 20 | {{- include "pyspark.labels" . | nindent 4 }} 21 | {{- with .Values.ingress.annotations }} 22 | annotations: 23 | {{- toYaml . | nindent 4 }} 24 | {{- end }} 25 | spec: 26 | {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} 27 | ingressClassName: {{ .Values.ingress.className }} 28 | {{- end }} 29 | {{- if .Values.ingress.tls }} 30 | tls: 31 | {{- range .Values.ingress.tls }} 32 | - hosts: 33 | {{- range .hosts }} 34 | - {{ . | quote }} 35 | {{- end }} 36 | secretName: {{ .secretName }} 37 | {{- end }} 38 | {{- end }} 39 | rules: 40 | {{- range .Values.ingress.hosts }} 41 | - host: {{ .host | quote }} 42 | http: 43 | paths: 44 | {{- range .paths }} 45 | - path: {{ .path }} 46 | {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} 47 | pathType: {{ .pathType }} 48 | {{- end }} 49 | backend: 50 | {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} 51 | service: 52 | name: {{ $fullName }} 53 | port: 54 | number: {{ $svcPort }} 55 | {{- else }} 56 | serviceName: {{ $fullName }} 57 | servicePort: {{ $svcPort }} 58 | {{- end }} 59 | {{- end }} 60 | {{- end }} 61 | {{- end }} 62 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "pyspark.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "pyspark.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "pyspark.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "pyspark.labels" -}} 37 | helm.sh/chart: {{ include "pyspark.chart" . }} 38 | {{ include "pyspark.selectorLabels" . 
}} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "pyspark.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "pyspark.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "pyspark.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "pyspark.fullname" .) .Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | 64 | {{/* 65 | Return the appropriate apiVersion for statefulset. 66 | */}} 67 | {{- define "pyspark.statefulset.apiVersion" -}} 68 | {{- if semverCompare "<1.9-0" .Capabilities.KubeVersion.GitVersion -}} 69 | {{- print "apps/v1beta2" -}} 70 | {{- else -}} 71 | {{- print "apps/v1" -}} 72 | {{- end -}} 73 | {{- end -}} 74 | 75 | {{/* 76 | Return custom environment variables 77 | */}} 78 | {{- define "pyspark.customEnvironment" }} 79 | # Dynamically created environment variables 80 | {{- range $i, $config := .Values.env }} 81 | - name: {{ $config.name }} 82 | value: {{ $config.value | quote }} 83 | {{- end }} 84 | # Dynamically created secret envs 85 | {{- range $i, $config := .Values.secret }} 86 | - name: {{ $config.envName }} 87 | valueFrom: 88 | secretKeyRef: 89 | name: {{ $config.secretName }} 90 | key: {{ default "value" $config.secretKey }} 91 | {{- end }} 92 | {{- end }} -------------------------------------------------------------------------------- /charts/pyspark-notebook/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for pyspark. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 4 | 5 | replicaCount: 1 6 | 7 | image: 8 | repository: jupyter/pyspark-notebook 9 | pullPolicy: IfNotPresent 10 | tag: "spark-3.1.2" 11 | 12 | imagePullSecrets: [] 13 | nameOverride: "" 14 | fullnameOverride: "" 15 | 16 | volumeClaimTemplate: 17 | accessModes: ["ReadWriteOnce"] 18 | volumeMode: Filesystem 19 | resources: 20 | requests: 21 | storage: 10Gi 22 | storageClassName: 23 | 24 | persistence: 25 | enabled: true 26 | labels: 27 | # Add default labels for the volumeClaimTemplate fo the StatefulSet 28 | enabled: false 29 | annotations: {} 30 | 31 | # Extra environment variables to append to pyspark container 32 | # This will be appended to the current 'env:' key. You can use any of the kubernetes env 33 | # syntax here 34 | env: [] 35 | # - name: GOOGLE_APPLICATION_CREDENTIALS 36 | # value: /mnt/secrets/key.json 37 | 38 | # Allows you to load environment variables from kubernetes secret 39 | secret: [] 40 | # - envName: AWS_ACCESS_KEY_ID 41 | # secretName: "" 42 | # secretKey: "" 43 | # - envName: AWS_SECRET_ACCESS_KEY 44 | # secretName: "" 45 | # secretKey: "" 46 | 47 | # Mount additional volumes into pyspark container. 
48 | extraVolumes: [] 49 | # - name: secrets 50 | # secret: 51 | # secretName: gcp-credentials 52 | 53 | extraVolumeMounts: [] 54 | # - name: secrets 55 | # mountPath: "/mnt/secrets" 56 | # readOnly: true 57 | 58 | serviceAccount: 59 | # Specifies whether a service account should be created 60 | create: true 61 | # Annotations to add to the service account 62 | annotations: {} 63 | # The name of the service account to use. 64 | # If not set and create is true, a name is generated using the fullname template 65 | name: "" 66 | 67 | podAnnotations: {} 68 | 69 | disableToken: false 70 | 71 | podSecurityContext: 72 | fsGroup: 100 73 | runAsUser: 1000 74 | 75 | securityContext: 76 | capabilities: 77 | drop: 78 | - ALL 79 | # readOnlyRootFilesystem: true 80 | runAsNonRoot: true 81 | runAsUser: 1000 82 | 83 | # The default is to deploy all pods serially. By setting this to parallel all pods are started at 84 | # the same time when bootstrapping the cluster 85 | podManagementPolicy: "Parallel" 86 | 87 | httpPort: 8888 88 | blockManagerPort: 7777 89 | driverPort: 2222 90 | 91 | service: 92 | type: ClusterIP 93 | nodePort: "" 94 | annotations: {} 95 | httpPortName: http 96 | blockManagerPortName: blockmanager 97 | driverPortName: driver 98 | loadBalancerIP: "" 99 | loadBalancerSourceRanges: [] 100 | externalTrafficPolicy: "" 101 | 102 | updateStrategy: RollingUpdate 103 | 104 | # How long to wait for pyspark to stop gracefully 105 | terminationGracePeriod: 30 106 | 107 | ingress: 108 | enabled: false 109 | className: "" 110 | annotations: {} 111 | # kubernetes.io/ingress.class: nginx 112 | # kubernetes.io/tls-acme: "true" 113 | hosts: 114 | - host: chart-example.local 115 | paths: 116 | - path: / 117 | pathType: ImplementationSpecific 118 | tls: [] 119 | # - secretName: chart-example-tls 120 | # hosts: 121 | # - chart-example.local 122 | 123 | resources: 124 | requests: 125 | cpu: "1000m" 126 | memory: "2Gi" 127 | limits: 128 | cpu: "2000m" 129 | memory: "16Gi" 130 | 131 | nodeSelector: {} 132 | 133 | tolerations: [] 134 | 135 | affinity: {} 136 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/statefulset.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: {{ template "pyspark.statefulset.apiVersion" . }} 3 | kind: StatefulSet 4 | metadata: 5 | name: {{ template "pyspark.fullname" . }} 6 | labels: 7 | {{- include "pyspark.labels" . | nindent 4 }} 8 | spec: 9 | serviceName: {{ template "pyspark.fullname" . }}-headless 10 | selector: 11 | matchLabels: 12 | {{- include "pyspark.selectorLabels" . | nindent 6 }} 13 | replicas: {{ .Values.replicaCount }} 14 | podManagementPolicy: {{ .Values.podManagementPolicy }} 15 | updateStrategy: 16 | type: {{ .Values.updateStrategy }} 17 | {{- if .Values.persistence.enabled }} 18 | volumeClaimTemplates: 19 | - metadata: 20 | name: {{ template "pyspark.fullname" . }}-notebooks 21 | {{- if .Values.persistence.labels.enabled }} 22 | labels: 23 | {{- include "pyspark.labels" . | nindent 8 }} 24 | {{- end }} 25 | {{- with .Values.persistence.annotations }} 26 | annotations: 27 | {{ toYaml . | indent 8 }} 28 | {{- end }} 29 | spec: 30 | {{ toYaml .Values.volumeClaimTemplate | indent 6 }} 31 | {{- end }} 32 | template: 33 | metadata: 34 | # name: "{{ template "pyspark.fullname" . }}" 35 | {{- with .Values.podAnnotations }} 36 | annotations: 37 | {{- toYaml . | nindent 8 }} 38 | {{- end }} 39 | labels: 40 | {{- include "pyspark.selectorLabels" . 
| nindent 8 }} 41 | spec: 42 | terminationGracePeriodSeconds: {{ .Values.terminationGracePeriod }} 43 | {{- with .Values.imagePullSecrets }} 44 | imagePullSecrets: 45 | {{- toYaml . | nindent 8 }} 46 | {{- end }} 47 | serviceAccountName: {{ include "pyspark.serviceAccountName" . }} 48 | securityContext: 49 | {{- toYaml .Values.podSecurityContext | nindent 8 }} 50 | volumes: 51 | {{- if .Values.extraVolumes }} 52 | {{- toYaml .Values.extraVolumes | nindent 8 }} 53 | {{- end }} 54 | containers: 55 | - name: {{ .Chart.Name }} 56 | securityContext: 57 | {{- toYaml .Values.securityContext | nindent 12 }} 58 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" 59 | imagePullPolicy: {{ .Values.image.pullPolicy }} 60 | command: ["start.sh"] 61 | args: 62 | - "jupyter" 63 | - "lab" 64 | {{- if .Values.disableToken }} 65 | - "--LabApp.token=''" 66 | {{- end }} 67 | ports: 68 | - name: http 69 | containerPort: 8888 70 | protocol: TCP 71 | - name: blockmanager 72 | containerPort: 7777 73 | protocol: TCP 74 | - name: driver 75 | containerPort: 2222 76 | protocol: TCP 77 | livenessProbe: 78 | httpGet: 79 | path: / 80 | port: http 81 | readinessProbe: 82 | httpGet: 83 | path: / 84 | port: http 85 | resources: 86 | {{- toYaml .Values.resources | nindent 12 }} 87 | volumeMounts: 88 | {{- if .Values.persistence.enabled }} 89 | - name: {{ template "pyspark.fullname" . }}-notebooks 90 | mountPath: /home/jovyan/work/ 91 | {{- end }} 92 | {{- if .Values.extraVolumeMounts }} 93 | {{- toYaml .Values.extraVolumeMounts | nindent 12 }} 94 | {{- end }} 95 | env: 96 | {{- include "pyspark.customEnvironment" . | indent 10 }} 97 | {{- with .Values.nodeSelector }} 98 | nodeSelector: 99 | {{- toYaml . | nindent 8 }} 100 | {{- end }} 101 | {{- with .Values.affinity }} 102 | affinity: 103 | {{- toYaml . | nindent 8 }} 104 | {{- end }} 105 | {{- with .Values.tolerations }} 106 | tolerations: 107 | {{- toYaml . | nindent 8 }} 108 | {{- end }} 109 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pyspark Notebook Helm Chart 2 | 3 | ## Introduction 4 | This repo provides 5 | the Kubernetes [Helm](https://helm.sh/) chart for deploying 6 | [Pyspark Notebook](https://hub.docker.com/r/jupyter/pyspark-notebook). 7 | 8 | ## Setup 9 | 1. Set up a kubernetes cluster 10 | - In a cloud platform of choice like [Amazon EKS](https://aws.amazon.com/eks), 11 | [Google Kubernetes Engine](https://cloud.google.com/kubernetes-engine), 12 | and [Azure Kubernetes Service](https://azure.microsoft.com/en-us/services/kubernetes-service/) OR 13 | - In local environment using [Minikube](https://minikube.sigs.k8s.io/docs/). 14 | 2. Install the following tools: 15 | - [kubectl](https://kubernetes.io/docs/tasks/tools/) to manage kubernetes resources 16 | - [helm](https://helm.sh/docs/intro/install/) to deploy the resources based on helm charts. 17 | Note, we only support Helm 3. 18 | 19 | ## Quickstart 20 | 21 | Add pyspark-notebook helm repo by running the following 22 | 23 | ```(shell) 24 | helm repo add pyspark-notebook https://a3data.github.io/pyspark-notebook-helm/ 25 | ``` 26 | 27 | Then, deploy the pyspark-notebook by running the following 28 | 29 | ```(shell) 30 | helm install pyspark-notebook pyspark-notebook/pyspark-notebook 31 | ``` 32 | 33 | Run `kubectl get all` to check whether all the pyspark resources are running. You should get a result similar to below. 
34 | 35 | ``` 36 | NAME READY STATUS RESTARTS AGE 37 | pod/pyspark-0 1/1 Running 0 9m18s 38 | 39 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 40 | service/pyspark ClusterIP 10.110.1.129 <none> 8888/TCP,7777/TCP,2222/TCP 9m18s 41 | service/pyspark-headless ClusterIP None <none> 8888/TCP,7777/TCP,2222/TCP 9m18s 42 | 43 | NAME READY AGE 44 | statefulset.apps/pyspark 1/1 9m18s 45 | ``` 46 | 47 | You can run the following to expose the notebook locally. 48 | 49 | ```(shell) 50 | kubectl port-forward svc/<service-name> 8888:8888 51 | ``` 52 | 53 | You should be able to access the frontend via http://localhost:8888. 54 | 55 | ## Get Token 56 | 57 | ```(shell) 58 | kubectl exec -it pod/pyspark-0 -- bash 59 | jupyter server list 60 | ``` 61 | 62 | ## LoadBalancer 63 | 64 | ```sh 65 | helm install pyspark-notebook pyspark-notebook/pyspark-notebook --set service.type=LoadBalancer 66 | ``` 67 | 68 | 69 | ## GCP Example 70 | 71 | Create a secret from a GCP service account `key.json` file. 72 | ```sh 73 | kubectl create secret generic gcp-credentials --from-file="./config/key.json" 74 | ``` 75 | Alter `values.yaml`: 76 | 77 | ```yaml 78 | env: 79 | - name: GOOGLE_APPLICATION_CREDENTIALS 80 | value: /mnt/secrets/key.json 81 | 82 | extraVolumes: 83 | - name: secrets 84 | secret: 85 | secretName: gcp-credentials 86 | 87 | extraVolumeMounts: 88 | - name: secrets 89 | mountPath: "/mnt/secrets" 90 | readOnly: true 91 | ``` 92 | 93 | 94 | ## AWS Example 95 | 96 | Create a secret from a `key.json` file. 97 | ```sh 98 | kubectl create secret generic aws-credentials --from-file="./config/key.json" 99 | ``` 100 | 101 | Or you can create a secret directly in the terminal: 102 | ```sh 103 | kubectl create secret generic aws-credentials --from-literal=aws_access_key_id=<your-access-key-id> --from-literal=aws_secret_access_key=<your-secret-access-key> 104 | ``` 105 | 106 | Alter `values.yaml` to set your AWS credentials as environment variables: 107 | ```yaml 108 | # Allows you to load environment variables from kubernetes secret 109 | secret: 110 | - envName: AWS_ACCESS_KEY_ID 111 | secretName: aws-credentials 112 | secretKey: aws_access_key_id 113 | - envName: AWS_SECRET_ACCESS_KEY 114 | secretName: aws-credentials 115 | secretKey: aws_secret_access_key 116 | ``` 117 | 118 | Then deploy the Helm chart with the `helm install` command shown above. 119 | 120 | For the notebook to connect to AWS S3, you have to set up the correct Spark configuration in your `.py` file. An example: 121 | ```python 122 | from pyspark import SparkConf, SparkContext 123 | from pyspark.sql import functions as f 124 | from pyspark.sql import SparkSession 125 | 126 | # Spark configuration 127 | conf = ( 128 | SparkConf().set('spark.executor.extraJavaOptions','-Dcom.amazonaws.services.s3.enableV4=true') 129 | .set('spark.driver.extraJavaOptions','-Dcom.amazonaws.services.s3.enableV4=true') 130 | .set("spark.hadoop.fs.s3a.fast.upload", "true") 131 | .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 132 | .set('spark.jars.packages', 'software.amazon.awssdk:s3:2.17.133,org.apache.hadoop:hadoop-aws:3.2.0') 133 | .set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.EnvironmentVariableCredentialsProvider') 134 | ) 135 | sc = SparkContext.getOrCreate(conf=conf) 136 | 137 | spark = SparkSession(sc) 138 | 139 | df = spark.read.parquet("s3a://<bucket>/<path>") 140 | 141 | df.printSchema() 142 | ``` 143 | 144 | Make sure the credentials you passed as environment variables have access to the S3 bucket.
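As a quick sanity check that the chart actually injected those variables (this assumes the default single replica, so the pod is named `pyspark-0`, and prints only the variable names rather than the secret values):

```sh
# List the names of the AWS variables injected from the secret (drop the cut to also see values)
kubectl exec pyspark-0 -- env | grep '^AWS_' | cut -d= -f1
```

If nothing is printed, double-check the `secret:` entries in `values.yaml` and that the `aws-credentials` secret exists in the release namespace.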
145 | 146 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/README.md: -------------------------------------------------------------------------------- 1 | # Pyspark Notebook Helm Chart 2 | 3 | ## Introduction 4 | This repo provides 5 | the Kubernetes [Helm](https://helm.sh/) chart for deploying 6 | [Pyspark Notebook](https://hub.docker.com/r/jupyter/pyspark-notebook). 7 | 8 | ## Setup 9 | 1. Set up a kubernetes cluster 10 | - In a cloud platform of choice like [Amazon EKS](https://aws.amazon.com/eks), 11 | [Google Kubernetes Engine](https://cloud.google.com/kubernetes-engine), 12 | and [Azure Kubernetes Service](https://azure.microsoft.com/en-us/services/kubernetes-service/) OR 13 | - In local environment using [Minikube](https://minikube.sigs.k8s.io/docs/). 14 | 2. Install the following tools: 15 | - [kubectl](https://kubernetes.io/docs/tasks/tools/) to manage kubernetes resources 16 | - [helm](https://helm.sh/docs/intro/install/) to deploy the resources based on helm charts. 17 | Note, we only support Helm 3. 18 | 19 | ## Quickstart 20 | 21 | Clone the repository 22 | 23 | ```(shell) 24 | git clone https://github.com/A3Data/pyspark-helm.git 25 | ``` 26 | 27 | Then deploy Pyspark Notebook by running the following 28 | 29 | ```(shell) 30 | helm dependency update ./pyspark-helm/ 31 | helm install pyspark ./pyspark-helm/ --values ./pyspark-helm/values.yaml 32 | ``` 33 | 34 | Run `kubectl get all` to check whether all the pyspark resources are running. You should get a result similar to below. 35 | 36 | ``` 37 | NAME READY STATUS RESTARTS AGE 38 | pod/pyspark-0 1/1 Running 0 9m18s 39 | 40 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 41 | service/pyspark ClusterIP 10.110.1.129 <none> 8888/TCP,7777/TCP,2222/TCP 9m18s 42 | service/pyspark-headless ClusterIP None <none> 8888/TCP,7777/TCP,2222/TCP 9m18s 43 | 44 | NAME READY AGE 45 | statefulset.apps/pyspark 1/1 9m18s 46 | ``` 47 | 48 | You can run the following to expose the notebook locally. 49 | 50 | ```(shell) 51 | kubectl port-forward svc/<service-name> 8888:8888 52 | ``` 53 | 54 | You should be able to access the frontend via http://localhost:8888. 55 | 56 | ## Get Token 57 | 58 | ```(shell) 59 | kubectl exec -it pod/pyspark-0 -- bash 60 | jupyter server list 61 | ``` 62 | 63 | ## LoadBalancer 64 | 65 | ```sh 66 | helm install pyspark ./pyspark-helm/ --values ./pyspark-helm/values.yaml --set service.type=LoadBalancer 67 | ``` 68 | 69 | ## GCP Example 70 | 71 | Create a secret from a GCP service account `key.json` file. 72 | ```sh 73 | kubectl create secret generic gcp-credentials --from-file="./config/key.json" 74 | ``` 75 | Alter `values.yaml`: 76 | 77 | ```yaml 78 | env: 79 | - name: GOOGLE_APPLICATION_CREDENTIALS 80 | value: /mnt/secrets/key.json 81 | 82 | extraVolumes: 83 | - name: secrets 84 | secret: 85 | secretName: gcp-credentials 86 | 87 | extraVolumeMounts: 88 | - name: secrets 89 | mountPath: "/mnt/secrets" 90 | readOnly: true 91 | ``` 92 | 93 | 94 | ## AWS Example 95 | 96 | Create a secret from a `key.json` file.
97 | ```sh 98 | kubectl create secret generic aws-credentials --from-file="./config/key.json" 99 | ``` 100 | 101 | Or you can create a secret directly in the terminal: 102 | ```sh 103 | kubectl create secret generic aws-credentials --from-literal=aws_access_key_id=<your-access-key-id> --from-literal=aws_secret_access_key=<your-secret-access-key> 104 | ``` 105 | 106 | Alter `values.yaml` to set your AWS credentials as environment variables: 107 | ```yaml 108 | # Allows you to load environment variables from kubernetes secret 109 | secret: 110 | - envName: AWS_ACCESS_KEY_ID 111 | secretName: aws-credentials 112 | secretKey: aws_access_key_id 113 | - envName: AWS_SECRET_ACCESS_KEY 114 | secretName: aws-credentials 115 | secretKey: aws_secret_access_key 116 | ``` 117 | 118 | Then deploy the Helm chart with the `helm install` command shown above. 119 | 120 | For the notebook to connect to AWS S3, you have to set up the correct Spark configuration in your `.py` file. An example: 121 | ```python 122 | from pyspark import SparkConf, SparkContext 123 | from pyspark.sql import functions as f 124 | from pyspark.sql import SparkSession 125 | 126 | # Spark configuration 127 | conf = ( 128 | SparkConf().set('spark.executor.extraJavaOptions','-Dcom.amazonaws.services.s3.enableV4=true') 129 | .set('spark.driver.extraJavaOptions','-Dcom.amazonaws.services.s3.enableV4=true') 130 | .set("spark.hadoop.fs.s3a.fast.upload", "true") 131 | .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 132 | .set('spark.jars.packages', 'software.amazon.awssdk:s3:2.17.133,org.apache.hadoop:hadoop-aws:3.2.0') 133 | .set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.EnvironmentVariableCredentialsProvider') 134 | ) 135 | sc = SparkContext.getOrCreate(conf=conf) 136 | 137 | spark = SparkSession(sc) 138 | 139 | df = spark.read.parquet("s3a://<bucket>/<path>") 140 | 141 | df.printSchema() 142 | ``` 143 | 144 | Make sure the credentials you passed as environment variables have access to the S3 bucket. 145 | 146 | --------------------------------------------------------------------------------