├── .gitignore ├── cr.yaml ├── ct.yaml ├── charts └── pyspark-notebook │ ├── templates │ ├── serviceaccount.yaml │ ├── rolebinding.yaml │ ├── role.yaml │ ├── NOTES.txt │ ├── service.yaml │ ├── ingress.yaml │ ├── _helpers.tpl │ └── statefulset.yaml │ ├── .helmignore │ ├── Chart.yaml │ ├── values.yaml │ └── README.md ├── .github └── workflows │ ├── release.yml │ └── lint-test.yml └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.iml 3 | 4 | node_modules/ 5 | lib/ 6 | 7 | logs/ 8 | *.log* 9 | -------------------------------------------------------------------------------- /cr.yaml: -------------------------------------------------------------------------------- 1 | # Set to true for GPG signing 2 | sign: false 3 | # UID of the GPG key to use 4 | key: Chart Releaser Test Key -------------------------------------------------------------------------------- /ct.yaml: -------------------------------------------------------------------------------- 1 | # See https://github.com/helm/chart-testing#configuration 2 | remote: origin 3 | target-branch: master 4 | chart-dirs: 5 | - charts 6 | chart-repos: 7 | - pyspark-notebook=https://a3data.github.io/pyspark-notebook-helm/ -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "pyspark.serviceAccountName" . }} 6 | labels: 7 | {{- include "pyspark.labels" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | {{- end }} 13 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/rolebinding.yaml: -------------------------------------------------------------------------------- 1 | {{- $fullName := include "pyspark.fullname" . -}} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: RoleBinding 4 | metadata: 5 | name: {{ $fullName | quote }} 6 | labels: 7 | {{- include "pyspark.labels" . | nindent 4 }} 8 | subjects: 9 | - kind: ServiceAccount 10 | name: {{ include "pyspark.serviceAccountName" . 
}} 11 | namespace: {{ .Release.Namespace | quote }} 12 | roleRef: 13 | kind: Role 14 | name: {{ $fullName | quote }} 15 | apiGroup: rbac.authorization.k8s.io 16 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release Charts 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | release: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v2 14 | with: 15 | fetch-depth: 0 16 | 17 | - name: Configure Git 18 | run: | 19 | git config user.name "$GITHUB_ACTOR" 20 | git config user.email "$GITHUB_ACTOR@users.noreply.github.com" 21 | 22 | - name: Run chart-releaser 23 | uses: helm/chart-releaser-action@v1.1.0 24 | env: 25 | CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" 26 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/role.yaml: -------------------------------------------------------------------------------- 1 | {{- $fullName := include "pyspark.fullname" . -}} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | name: {{ $fullName | quote }} 6 | labels: 7 | {{- include "pyspark.labels" . | nindent 4 }} 8 | rules: 9 | - apiGroups: 10 | - "" 11 | resources: 12 | - pods 13 | verbs: 14 | - create 15 | - get 16 | - delete 17 | - list 18 | - watch 19 | - apiGroups: 20 | - "" 21 | resources: 22 | - services 23 | verbs: 24 | - get 25 | - create 26 | - apiGroups: 27 | - "" 28 | resources: 29 | - pods/log 30 | verbs: 31 | - get 32 | - list 33 | - apiGroups: 34 | - "" 35 | resources: 36 | - pods/exec 37 | verbs: 38 | - create 39 | - get 40 | - apiGroups: 41 | - "" 42 | resources: 43 | - configmaps 44 | verbs: 45 | - get 46 | - create 47 | - list 48 | - watch 49 | - delete 50 | -------------------------------------------------------------------------------- /.github/workflows/lint-test.yml: -------------------------------------------------------------------------------- 1 | name: Lint and Test Charts 2 | 3 | on: pull_request 4 | 5 | jobs: 6 | lint-test: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout 10 | uses: actions/checkout@v2 11 | with: 12 | fetch-depth: 0 13 | 14 | - name: Set up Helm 15 | uses: azure/setup-helm@v1 16 | with: 17 | version: v3.4.1 18 | 19 | # Python is required because `ct lint` runs Yamale (https://github.com/23andMe/Yamale) and 20 | # yamllint (https://github.com/adrienverge/yamllint) which require Python 21 | - name: Set up Python 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: 3.7 25 | 26 | - name: Set up chart-testing 27 | uses: helm/chart-testing-action@v2.0.1 28 | with: 29 | version: v3.3.0 30 | 31 | - name: Run chart-testing (lint) 32 | run: ct lint --config ct.yaml 33 | 34 | - name: Create kind cluster 35 | uses: helm/kind-action@v1.2.0 36 | 37 | - name: Run chart-testing (install) 38 | run: ct install --config ct.yaml -------------------------------------------------------------------------------- /charts/pyspark-notebook/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: pyspark-notebook 3 | description: A Kubernetes Helm chart for deploying Pyspark Notebook 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 
9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.2.2 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | appVersion: "1.16.0" 25 | maintainers: 26 | - email: marcusvoliveira@dcc.ufmg.br 27 | name: voidmarcus 28 | - email: maylateixeira2013@gmail.com 29 | name: maylatt 30 | - email: neylsoncrepalde@gmail.com 31 | name: neylsoncrepalde 32 | 33 | sources: 34 | - https://github.com/a3data/pyspark-notebook-helm 35 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | 1. Get the application URL by running these commands: 2 | {{- if .Values.ingress.enabled }} 3 | {{- range $host := .Values.ingress.hosts }} 4 | {{- range .paths }} 5 | http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} 6 | {{- end }} 7 | {{- end }} 8 | {{- else if contains "NodePort" .Values.service.type }} 9 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "pyspark.fullname" . }}) 10 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") 11 | echo http://$NODE_IP:$NODE_PORT 12 | {{- else if contains "LoadBalancer" .Values.service.type }} 13 | NOTE: It may take a few minutes for the LoadBalancer IP to be available. 14 | You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "pyspark.fullname" . }}' 15 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "pyspark.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") 16 | echo http://$SERVICE_IP:{{ .Values.service.port }} 17 | {{- else if contains "ClusterIP" .Values.service.type }} 18 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "pyspark.name" . 
}},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") 19 | export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") 20 | echo "Visit http://127.0.0.1:8080 to use your application" 21 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT 22 | {{- end }} 23 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/service.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: {{ include "pyspark.fullname" . }} 6 | labels: 7 | {{- include "pyspark.labels" . | nindent 4 }} 8 | annotations: 9 | {{ toYaml .Values.service.annotations | indent 4 }} 10 | spec: 11 | type: {{ .Values.service.type }} 12 | selector: 13 | {{- include "pyspark.selectorLabels" . | nindent 4 }} 14 | ports: 15 | - name: {{ .Values.service.httpPortName | default "http" }} 16 | protocol: TCP 17 | port: {{ .Values.httpPort }} 18 | targetPort: http 19 | {{- if .Values.service.nodePort }} 20 | nodePort: {{ .Values.service.nodePort }} 21 | {{- end }} 22 | - name: {{ .Values.service.blockManagerPortName | default "blockmanager" }} 23 | protocol: TCP 24 | port: {{ .Values.blockManagerPort }} 25 | targetPort: blockmanager 26 | - name: {{ .Values.service.driverPortName | default "driver" }} 27 | protocol: TCP 28 | port: {{ .Values.driverPort }} 29 | targetPort: driver 30 | {{- if .Values.service.loadBalancerIP }} 31 | loadBalancerIP: {{ .Values.service.loadBalancerIP }} 32 | {{- end }} 33 | {{- with .Values.service.loadBalancerSourceRanges }} 34 | loadBalancerSourceRanges: 35 | {{ toYaml . | indent 4 }} 36 | {{- end }} 37 | {{- if .Values.service.externalTrafficPolicy }} 38 | externalTrafficPolicy: {{ .Values.service.externalTrafficPolicy }} 39 | {{- end }} 40 | 41 | --- 42 | apiVersion: v1 43 | kind: Service 44 | metadata: 45 | name: {{ template "pyspark.fullname" . }}-headless 46 | labels: 47 | {{- include "pyspark.labels" . | nindent 4 }} 48 | annotations: 49 | service.alpha.kubernetes.io/tolerate-unready-endpoints: "true" 50 | spec: 51 | clusterIP: None # This is needed for statefulset hostnames like pyspark-0 to resolve 52 | # Create endpoints also if the related pod isn't ready 53 | publishNotReadyAddresses: true 54 | selector: 55 | app: "{{ template "pyspark.fullname" . }}" 56 | ports: 57 | - name: {{ .Values.service.httpPortName | default "http" }} 58 | port: {{ .Values.httpPort }} 59 | - name: {{ .Values.service.blockManagerPortName | default "blockmanager" }} 60 | port: {{ .Values.blockManagerPort }} 61 | - name: {{ .Values.service.driverPortName | default "driver" }} 62 | port: {{ .Values.driverPort }} -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/ingress.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.ingress.enabled -}} 2 | {{- $fullName := include "pyspark.fullname" . 
-}} 3 | {{- $svcPort := .Values.service.port -}} 4 | {{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} 5 | {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} 6 | {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} 7 | {{- end }} 8 | {{- end }} 9 | {{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} 10 | apiVersion: networking.k8s.io/v1 11 | {{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} 12 | apiVersion: networking.k8s.io/v1beta1 13 | {{- else -}} 14 | apiVersion: extensions/v1beta1 15 | {{- end }} 16 | kind: Ingress 17 | metadata: 18 | name: {{ $fullName }} 19 | labels: 20 | {{- include "pyspark.labels" . | nindent 4 }} 21 | {{- with .Values.ingress.annotations }} 22 | annotations: 23 | {{- toYaml . | nindent 4 }} 24 | {{- end }} 25 | spec: 26 | {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} 27 | ingressClassName: {{ .Values.ingress.className }} 28 | {{- end }} 29 | {{- if .Values.ingress.tls }} 30 | tls: 31 | {{- range .Values.ingress.tls }} 32 | - hosts: 33 | {{- range .hosts }} 34 | - {{ . | quote }} 35 | {{- end }} 36 | secretName: {{ .secretName }} 37 | {{- end }} 38 | {{- end }} 39 | rules: 40 | {{- range .Values.ingress.hosts }} 41 | - host: {{ .host | quote }} 42 | http: 43 | paths: 44 | {{- range .paths }} 45 | - path: {{ .path }} 46 | {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} 47 | pathType: {{ .pathType }} 48 | {{- end }} 49 | backend: 50 | {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} 51 | service: 52 | name: {{ $fullName }} 53 | port: 54 | number: {{ $svcPort }} 55 | {{- else }} 56 | serviceName: {{ $fullName }} 57 | servicePort: {{ $svcPort }} 58 | {{- end }} 59 | {{- end }} 60 | {{- end }} 61 | {{- end }} 62 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "pyspark.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "pyspark.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "pyspark.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "pyspark.labels" -}} 37 | helm.sh/chart: {{ include "pyspark.chart" . }} 38 | {{ include "pyspark.selectorLabels" . 
}} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "pyspark.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "pyspark.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "pyspark.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "pyspark.fullname" .) .Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | 64 | {{/* 65 | Return the appropriate apiVersion for statefulset. 66 | */}} 67 | {{- define "pyspark.statefulset.apiVersion" -}} 68 | {{- if semverCompare "<1.9-0" .Capabilities.KubeVersion.GitVersion -}} 69 | {{- print "apps/v1beta2" -}} 70 | {{- else -}} 71 | {{- print "apps/v1" -}} 72 | {{- end -}} 73 | {{- end -}} 74 | 75 | {{/* 76 | Return custom environment variables 77 | */}} 78 | {{- define "pyspark.customEnvironment" }} 79 | # Dynamically created environment variables 80 | {{- range $i, $config := .Values.env }} 81 | - name: {{ $config.name }} 82 | value: {{ $config.value | quote }} 83 | {{- end }} 84 | # Dynamically created secret envs 85 | {{- range $i, $config := .Values.secret }} 86 | - name: {{ $config.envName }} 87 | valueFrom: 88 | secretKeyRef: 89 | name: {{ $config.secretName }} 90 | key: {{ default "value" $config.secretKey }} 91 | {{- end }} 92 | {{- end }} -------------------------------------------------------------------------------- /charts/pyspark-notebook/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for pyspark. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 4 | 5 | replicaCount: 1 6 | 7 | image: 8 | repository: jupyter/pyspark-notebook 9 | pullPolicy: IfNotPresent 10 | tag: "spark-3.1.2" 11 | 12 | imagePullSecrets: [] 13 | nameOverride: "" 14 | fullnameOverride: "" 15 | 16 | volumeClaimTemplate: 17 | accessModes: ["ReadWriteOnce"] 18 | volumeMode: Filesystem 19 | resources: 20 | requests: 21 | storage: 10Gi 22 | storageClassName: 23 | 24 | persistence: 25 | enabled: true 26 | labels: 27 | # Add default labels for the volumeClaimTemplate fo the StatefulSet 28 | enabled: false 29 | annotations: {} 30 | 31 | # Extra environment variables to append to pyspark container 32 | # This will be appended to the current 'env:' key. You can use any of the kubernetes env 33 | # syntax here 34 | env: [] 35 | # - name: GOOGLE_APPLICATION_CREDENTIALS 36 | # value: /mnt/secrets/key.json 37 | 38 | # Allows you to load environment variables from kubernetes secret 39 | secret: [] 40 | # - envName: AWS_ACCESS_KEY_ID 41 | # secretName: "" 42 | # secretKey: "" 43 | # - envName: AWS_SECRET_ACCESS_KEY 44 | # secretName: "" 45 | # secretKey: "" 46 | 47 | # Mount additional volumes into pyspark container. 
48 | extraVolumes: [] 49 | # - name: secrets 50 | # secret: 51 | # secretName: gcp-credentials 52 | 53 | extraVolumeMounts: [] 54 | # - name: secrets 55 | # mountPath: "/mnt/secrets" 56 | # readOnly: true 57 | 58 | serviceAccount: 59 | # Specifies whether a service account should be created 60 | create: true 61 | # Annotations to add to the service account 62 | annotations: {} 63 | # The name of the service account to use. 64 | # If not set and create is true, a name is generated using the fullname template 65 | name: "" 66 | 67 | podAnnotations: {} 68 | 69 | disableToken: false 70 | 71 | podSecurityContext: 72 | fsGroup: 100 73 | runAsUser: 1000 74 | 75 | securityContext: 76 | capabilities: 77 | drop: 78 | - ALL 79 | # readOnlyRootFilesystem: true 80 | runAsNonRoot: true 81 | runAsUser: 1000 82 | 83 | # The default is to deploy all pods serially. By setting this to parallel all pods are started at 84 | # the same time when bootstrapping the cluster 85 | podManagementPolicy: "Parallel" 86 | 87 | httpPort: 8888 88 | blockManagerPort: 7777 89 | driverPort: 2222 90 | 91 | service: 92 | type: ClusterIP 93 | nodePort: "" 94 | annotations: {} 95 | httpPortName: http 96 | blockManagerPortName: blockmanager 97 | driverPortName: driver 98 | loadBalancerIP: "" 99 | loadBalancerSourceRanges: [] 100 | externalTrafficPolicy: "" 101 | 102 | updateStrategy: RollingUpdate 103 | 104 | # How long to wait for pyspark to stop gracefully 105 | terminationGracePeriod: 30 106 | 107 | ingress: 108 | enabled: false 109 | className: "" 110 | annotations: {} 111 | # kubernetes.io/ingress.class: nginx 112 | # kubernetes.io/tls-acme: "true" 113 | hosts: 114 | - host: chart-example.local 115 | paths: 116 | - path: / 117 | pathType: ImplementationSpecific 118 | tls: [] 119 | # - secretName: chart-example-tls 120 | # hosts: 121 | # - chart-example.local 122 | 123 | resources: 124 | requests: 125 | cpu: "1000m" 126 | memory: "2Gi" 127 | limits: 128 | cpu: "2000m" 129 | memory: "16Gi" 130 | 131 | nodeSelector: {} 132 | 133 | tolerations: [] 134 | 135 | affinity: {} 136 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/templates/statefulset.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: {{ template "pyspark.statefulset.apiVersion" . }} 3 | kind: StatefulSet 4 | metadata: 5 | name: {{ template "pyspark.fullname" . }} 6 | labels: 7 | {{- include "pyspark.labels" . | nindent 4 }} 8 | spec: 9 | serviceName: {{ template "pyspark.fullname" . }}-headless 10 | selector: 11 | matchLabels: 12 | {{- include "pyspark.selectorLabels" . | nindent 6 }} 13 | replicas: {{ .Values.replicaCount }} 14 | podManagementPolicy: {{ .Values.podManagementPolicy }} 15 | updateStrategy: 16 | type: {{ .Values.updateStrategy }} 17 | {{- if .Values.persistence.enabled }} 18 | volumeClaimTemplates: 19 | - metadata: 20 | name: {{ template "pyspark.fullname" . }}-notebooks 21 | {{- if .Values.persistence.labels.enabled }} 22 | labels: 23 | {{- include "pyspark.labels" . | nindent 8 }} 24 | {{- end }} 25 | {{- with .Values.persistence.annotations }} 26 | annotations: 27 | {{ toYaml . | indent 8 }} 28 | {{- end }} 29 | spec: 30 | {{ toYaml .Values.volumeClaimTemplate | indent 6 }} 31 | {{- end }} 32 | template: 33 | metadata: 34 | # name: "{{ template "pyspark.fullname" . }}" 35 | {{- with .Values.podAnnotations }} 36 | annotations: 37 | {{- toYaml . | nindent 8 }} 38 | {{- end }} 39 | labels: 40 | {{- include "pyspark.selectorLabels" . 
| nindent 8 }} 41 | spec: 42 | terminationGracePeriodSeconds: {{ .Values.terminationGracePeriod }} 43 | {{- with .Values.imagePullSecrets }} 44 | imagePullSecrets: 45 | {{- toYaml . | nindent 8 }} 46 | {{- end }} 47 | serviceAccountName: {{ include "pyspark.serviceAccountName" . }} 48 | securityContext: 49 | {{- toYaml .Values.podSecurityContext | nindent 8 }} 50 | volumes: 51 | {{- if .Values.extraVolumes }} 52 | {{- toYaml .Values.extraVolumes | nindent 8 }} 53 | {{- end }} 54 | containers: 55 | - name: {{ .Chart.Name }} 56 | securityContext: 57 | {{- toYaml .Values.securityContext | nindent 12 }} 58 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" 59 | imagePullPolicy: {{ .Values.image.pullPolicy }} 60 | command: ["start.sh"] 61 | args: 62 | - "jupyter" 63 | - "lab" 64 | {{- if .Values.disableToken }} 65 | - "--LabApp.token=''" 66 | {{- end }} 67 | ports: 68 | - name: http 69 | containerPort: 8888 70 | protocol: TCP 71 | - name: blockmanager 72 | containerPort: 7777 73 | protocol: TCP 74 | - name: driver 75 | containerPort: 2222 76 | protocol: TCP 77 | livenessProbe: 78 | httpGet: 79 | path: / 80 | port: http 81 | readinessProbe: 82 | httpGet: 83 | path: / 84 | port: http 85 | resources: 86 | {{- toYaml .Values.resources | nindent 12 }} 87 | volumeMounts: 88 | {{- if .Values.persistence.enabled }} 89 | - name: {{ template "pyspark.fullname" . }}-notebooks 90 | mountPath: /home/jovyan/work/ 91 | {{- end }} 92 | {{- if .Values.extraVolumeMounts }} 93 | {{- toYaml .Values.extraVolumeMounts | nindent 12 }} 94 | {{- end }} 95 | env: 96 | {{- include "pyspark.customEnvironment" . | indent 10 }} 97 | {{- with .Values.nodeSelector }} 98 | nodeSelector: 99 | {{- toYaml . | nindent 8 }} 100 | {{- end }} 101 | {{- with .Values.affinity }} 102 | affinity: 103 | {{- toYaml . | nindent 8 }} 104 | {{- end }} 105 | {{- with .Values.tolerations }} 106 | tolerations: 107 | {{- toYaml . | nindent 8 }} 108 | {{- end }} 109 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pyspark Notebook Helm Chart 2 | 3 | ## Introduction 4 | This repo provides 5 | the Kubernetes [Helm](https://helm.sh/) chart for deploying 6 | [Pyspark Notebook](https://hub.docker.com/r/jupyter/pyspark-notebook). 7 | 8 | ## Setup 9 | 1. Set up a kubernetes cluster 10 | - In a cloud platform of choice like [Amazon EKS](https://aws.amazon.com/eks), 11 | [Google Kubernetes Engine](https://cloud.google.com/kubernetes-engine), 12 | and [Azure Kubernetes Service](https://azure.microsoft.com/en-us/services/kubernetes-service/) OR 13 | - In local environment using [Minikube](https://minikube.sigs.k8s.io/docs/). 14 | 2. Install the following tools: 15 | - [kubectl](https://kubernetes.io/docs/tasks/tools/) to manage kubernetes resources 16 | - [helm](https://helm.sh/docs/intro/install/) to deploy the resources based on helm charts. 17 | Note, we only support Helm 3. 18 | 19 | ## Quickstart 20 | 21 | Add pyspark-notebook helm repo by running the following 22 | 23 | ```(shell) 24 | helm repo add pyspark-notebook https://a3data.github.io/pyspark-notebook-helm/ 25 | ``` 26 | 27 | Then, deploy the pyspark-notebook by running the following 28 | 29 | ```(shell) 30 | helm install pyspark-notebook pyspark-notebook/pyspark-notebook 31 | ``` 32 | 33 | Run `kubectl get all` to check whether all the pyspark resources are running. You should get a result similar to below. 
34 | 35 | ``` 36 | NAME READY STATUS RESTARTS AGE 37 | pod/pyspark-0 1/1 Running 0 9m18s 38 | 39 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 40 | service/pyspark ClusterIP 10.110.1.129 <none> 8888/TCP,7777/TCP,2222/TCP 9m18s 41 | service/pyspark-headless ClusterIP None <none> 8888/TCP,7777/TCP,2222/TCP 9m18s 42 | 43 | NAME READY AGE 44 | statefulset.apps/pyspark 1/1 9m18s 45 | ``` 46 | 47 | You can run the following to expose the notebook locally. 48 | 49 | ```(shell) 50 | kubectl port-forward svc/<service-name> 8888:8888 51 | ``` 52 | 53 | You should be able to access the frontend via http://localhost:8888. 54 | 55 | ## Get Token 56 | 57 | ```(shell) 58 | kubectl exec -it pod/pyspark-0 -- bash 59 | jupyter server list 60 | ``` 61 | 62 | ## LoadBalancer 63 | 64 | ```sh 65 | helm install pyspark-notebook pyspark-notebook/pyspark-notebook --set service.type=LoadBalancer 66 | ``` 67 | 68 | 69 | ## GCP Example 70 | 71 | Create a secret from a GCP service account `key.json` file. 72 | ```sh 73 | kubectl create secret generic gcp-credentials --from-file="./config/key.json" 74 | ``` 75 | Alter `values.yaml`: 76 | 77 | ```yaml 78 | env: 79 | - name: GOOGLE_APPLICATION_CREDENTIALS 80 | value: /mnt/secrets/key.json 81 | 82 | extraVolumes: 83 | - name: secrets 84 | secret: 85 | secretName: gcp-credentials 86 | 87 | extraVolumeMounts: 88 | - name: secrets 89 | mountPath: "/mnt/secrets" 90 | readOnly: true 91 | ``` 92 | 93 | 94 | ## AWS Example 95 | 96 | Create a secret from a `key.json` file. 97 | ```sh 98 | kubectl create secret generic aws-credentials --from-file="./config/key.json" 99 | ``` 100 | 101 | Or you can create a secret directly in the terminal: 102 | ```sh 103 | kubectl create secret generic aws-credentials --from-literal=aws_access_key_id=<your-access-key-id> --from-literal=aws_secret_access_key=<your-secret-access-key> 104 | ``` 105 | 106 | Alter `values.yaml` to set your AWS credentials as environment variables: 107 | ```yaml 108 | # Allows you to load environment variables from kubernetes secret 109 | secret: 110 | - envName: AWS_ACCESS_KEY_ID 111 | secretName: aws-credentials 112 | secretKey: aws_access_key_id 113 | - envName: AWS_SECRET_ACCESS_KEY 114 | secretName: aws-credentials 115 | secretKey: aws_secret_access_key 116 | ``` 117 | 118 | Then deploy the Helm chart with the `helm install` command shown above. 119 | 120 | For the notebook to connect to AWS S3, you have to set up the correct Spark configuration in your `.py` file. An example: 121 | ```python 122 | from pyspark import SparkConf, SparkContext 123 | from pyspark.sql import functions as f 124 | from pyspark.sql import SparkSession 125 | 126 | # Spark configuration 127 | conf = ( 128 | SparkConf().set('spark.executor.extraJavaOptions','-Dcom.amazonaws.services.s3.enableV4=true') 129 | .set('spark.driver.extraJavaOptions','-Dcom.amazonaws.services.s3.enableV4=true') 130 | .set("spark.hadoop.fs.s3a.fast.upload", "true") 131 | .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 132 | .set('spark.jars.packages', 'software.amazon.awssdk:s3:2.17.133,org.apache.hadoop:hadoop-aws:3.2.0') 133 | .set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.EnvironmentVariableCredentialsProvider') 134 | ) 135 | sc = SparkContext.getOrCreate(conf=conf) 136 | 137 | spark = SparkSession(sc) 138 | 139 | df = spark.read.parquet("s3a://<bucket>/<path>") 140 | 141 | df.printSchema() 142 | ``` 143 | 144 | Make sure the credentials you passed as environment variables have access to the S3 bucket.
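As a quick sanity check that the chart actually injected those variables (this assumes the default single replica, so the pod is named `pyspark-0`, and prints only the variable names rather than the secret values):

```sh
# List the names of the AWS variables injected from the secret (drop the cut to also see values)
kubectl exec pyspark-0 -- env | grep '^AWS_' | cut -d= -f1
```

If nothing is printed, double-check the `secret:` entries in `values.yaml` and that the `aws-credentials` secret exists in the release namespace.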
145 | 146 | -------------------------------------------------------------------------------- /charts/pyspark-notebook/README.md: -------------------------------------------------------------------------------- 1 | # Pyspark Notebook Helm Chart 2 | 3 | ## Introduction 4 | This repo provides 5 | the Kubernetes [Helm](https://helm.sh/) chart for deploying 6 | [Pyspark Notebook](https://hub.docker.com/r/jupyter/pyspark-notebook). 7 | 8 | ## Setup 9 | 1. Set up a kubernetes cluster 10 | - In a cloud platform of choice like [Amazon EKS](https://aws.amazon.com/eks), 11 | [Google Kubernetes Engine](https://cloud.google.com/kubernetes-engine), 12 | and [Azure Kubernetes Service](https://azure.microsoft.com/en-us/services/kubernetes-service/) OR 13 | - In local environment using [Minikube](https://minikube.sigs.k8s.io/docs/). 14 | 2. Install the following tools: 15 | - [kubectl](https://kubernetes.io/docs/tasks/tools/) to manage kubernetes resources 16 | - [helm](https://helm.sh/docs/intro/install/) to deploy the resources based on helm charts. 17 | Note, we only support Helm 3. 18 | 19 | ## Quickstart 20 | 21 | Clone the repository 22 | 23 | ```(shell) 24 | git clone https://github.com/A3Data/pyspark-helm.git 25 | ``` 26 | 27 | Then deploy Pyspark Notebook by running the following 28 | 29 | ```(shell) 30 | helm dependency update ./pyspark-helm/ 31 | helm install pyspark ./pyspark-helm/ --values ./pyspark-helm/values.yaml 32 | ``` 33 | 34 | Run `kubectl get all` to check whether all the pyspark resources are running. You should get a result similar to below. 35 | 36 | ``` 37 | NAME READY STATUS RESTARTS AGE 38 | pod/pyspark-0 1/1 Running 0 9m18s 39 | 40 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 41 | service/pyspark ClusterIP 10.110.1.129 <none> 8888/TCP,7777/TCP,2222/TCP 9m18s 42 | service/pyspark-headless ClusterIP None <none> 8888/TCP,7777/TCP,2222/TCP 9m18s 43 | 44 | NAME READY AGE 45 | statefulset.apps/pyspark 1/1 9m18s 46 | ``` 47 | 48 | You can run the following to expose the notebook locally. 49 | 50 | ```(shell) 51 | kubectl port-forward svc/<service-name> 8888:8888 52 | ``` 53 | 54 | You should be able to access the frontend via http://localhost:8888. 55 | 56 | ## Get Token 57 | 58 | ```(shell) 59 | kubectl exec -it pod/pyspark-0 -- bash 60 | jupyter server list 61 | ``` 62 | 63 | ## LoadBalancer 64 | 65 | ```sh 66 | helm install pyspark ./pyspark-helm/ --values ./pyspark-helm/values.yaml --set service.type=LoadBalancer 67 | ``` 68 | 69 | ## GCP Example 70 | 71 | Create a secret from a GCP service account `key.json` file. 72 | ```sh 73 | kubectl create secret generic gcp-credentials --from-file="./config/key.json" 74 | ``` 75 | Alter `values.yaml`: 76 | 77 | ```yaml 78 | env: 79 | - name: GOOGLE_APPLICATION_CREDENTIALS 80 | value: /mnt/secrets/key.json 81 | 82 | extraVolumes: 83 | - name: secrets 84 | secret: 85 | secretName: gcp-credentials 86 | 87 | extraVolumeMounts: 88 | - name: secrets 89 | mountPath: "/mnt/secrets" 90 | readOnly: true 91 | ``` 92 | 93 | 94 | ## AWS Example 95 | 96 | Create a secret from a `key.json` file.
97 | ```sh 98 | kubectl create secret generic aws-credentials --from-file="./config/key.json" 99 | ``` 100 | 101 | Or you can create a secret directly in the terminal: 102 | ```sh 103 | kubectl create secret generic aws-credentials --from-literal=aws_access_key_id=<your-access-key-id> --from-literal=aws_secret_access_key=<your-secret-access-key> 104 | ``` 105 | 106 | Alter `values.yaml` to set your AWS credentials as environment variables: 107 | ```yaml 108 | # Allows you to load environment variables from kubernetes secret 109 | secret: 110 | - envName: AWS_ACCESS_KEY_ID 111 | secretName: aws-credentials 112 | secretKey: aws_access_key_id 113 | - envName: AWS_SECRET_ACCESS_KEY 114 | secretName: aws-credentials 115 | secretKey: aws_secret_access_key 116 | ``` 117 | 118 | Then deploy the Helm chart with the `helm install` command shown above. 119 | 120 | For the notebook to connect to AWS S3, you have to set up the correct Spark configuration in your `.py` file. An example: 121 | ```python 122 | from pyspark import SparkConf, SparkContext 123 | from pyspark.sql import functions as f 124 | from pyspark.sql import SparkSession 125 | 126 | # Spark configuration 127 | conf = ( 128 | SparkConf().set('spark.executor.extraJavaOptions','-Dcom.amazonaws.services.s3.enableV4=true') 129 | .set('spark.driver.extraJavaOptions','-Dcom.amazonaws.services.s3.enableV4=true') 130 | .set("spark.hadoop.fs.s3a.fast.upload", "true") 131 | .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 132 | .set('spark.jars.packages', 'software.amazon.awssdk:s3:2.17.133,org.apache.hadoop:hadoop-aws:3.2.0') 133 | .set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.EnvironmentVariableCredentialsProvider') 134 | ) 135 | sc = SparkContext.getOrCreate(conf=conf) 136 | 137 | spark = SparkSession(sc) 138 | 139 | df = spark.read.parquet("s3a://<bucket>/<path>") 140 | 141 | df.printSchema() 142 | ``` 143 | 144 | Make sure the credentials you passed as environment variables have access to the S3 bucket. 145 | 146 | --------------------------------------------------------------------------------