├── .codecov.yml ├── .dockerignore ├── .github ├── CODEOWNERS └── workflows │ └── go.yml ├── .gitignore ├── .golangci.yml ├── .goreleaser.yml ├── Dockerfile ├── LICENSE ├── Makefile ├── PROJECT ├── README.md ├── api └── v1alpha1 │ ├── common_types.go │ ├── daskcluster_types.go │ ├── daskcluster_webhook.go │ ├── groupversion_info.go │ ├── mpicluster_types.go │ ├── mpicluster_webhook.go │ ├── raycluster_types.go │ ├── raycluster_webhook.go │ ├── raycluster_webhook_integration_test.go │ ├── sparkcluster_types.go │ ├── sparkcluster_webhook.go │ ├── sparkcluster_webhook_integration_test.go │ ├── validations.go │ ├── webhook_suite_test.go │ └── zz_generated.deepcopy.go ├── cluster-testing ├── dask.yaml └── ray.yaml ├── cmd ├── crdapply.go ├── crddelete.go ├── root.go └── start.go ├── config ├── certmanager │ ├── certificate.yaml │ ├── kustomization.yaml │ └── kustomizeconfig.yaml ├── crd │ ├── bases │ │ ├── distributed-compute.dominodatalab.com_daskclusters.yaml │ │ ├── distributed-compute.dominodatalab.com_mpiclusters.yaml │ │ ├── distributed-compute.dominodatalab.com_rayclusters.yaml │ │ └── distributed-compute.dominodatalab.com_sparkclusters.yaml │ ├── embed.go │ ├── kustomization.yaml │ ├── kustomizeconfig.yaml │ └── patches │ │ ├── cainjection_in_daskclusters.yaml │ │ ├── cainjection_in_mpiclusters.yaml │ │ ├── cainjection_in_rayclusters.yaml │ │ ├── cainjection_in_sparkclusters.yaml │ │ ├── webhook_in_daskclusters.yaml │ │ ├── webhook_in_mpiclusters.yaml │ │ ├── webhook_in_rayclusters.yaml │ │ └── webhook_in_sparkclusters.yaml ├── default │ ├── kustomization.yaml │ ├── manager_auth_proxy_patch.yaml │ ├── manager_config_patch.yaml │ ├── manager_webhook_patch.yaml │ └── webhookcainjection_patch.yaml ├── manager │ ├── controller_manager_config.yaml │ ├── kustomization.yaml │ └── manager.yaml ├── prometheus │ ├── kustomization.yaml │ └── monitor.yaml ├── rbac │ ├── auth_proxy_client_clusterrole.yaml │ ├── auth_proxy_role.yaml │ ├── auth_proxy_role_binding.yaml │ ├── auth_proxy_service.yaml │ ├── daskcluster_editor_role.yaml │ ├── daskcluster_viewer_role.yaml │ ├── kustomization.yaml │ ├── leader_election_role.yaml │ ├── leader_election_role_binding.yaml │ ├── mpicluster_editor_role.yaml │ ├── mpicluster_viewer_role.yaml │ ├── raycluster_editor_role.yaml │ ├── raycluster_viewer_role.yaml │ ├── role.yaml │ └── role_binding.yaml ├── samples │ ├── distributed-compute_v1alpha1_daskcluster.yaml │ ├── distributed-compute_v1alpha1_mpicluster.yaml │ ├── distributed-compute_v1alpha1_raycluster.yaml │ └── distributed-compute_v1alpha1_sparkcluster.yaml └── webhook │ ├── kustomization.yaml │ ├── kustomizeconfig.yaml │ ├── manifests.yaml │ └── service.yaml ├── controllers ├── config.go ├── controllers.go ├── daskcluster_controller.go ├── mpicluster_controller.go ├── raycluster_controller.go ├── raycluster_controller_integration_test.go ├── sparkcluster_controller.go ├── sparkcluster_controller_integration_test.go ├── suite_test.go └── variables.go ├── deploy └── helm │ └── distributed-compute-operator │ ├── .helmignore │ ├── Chart.lock │ ├── Chart.yaml │ ├── charts │ └── common-1.4.1.tgz │ ├── dco-values.yaml │ ├── templates │ ├── NOTES.txt │ ├── _helpers.tpl │ ├── clusterrole.yaml │ ├── clusterrolebinding.yaml │ ├── deployment.yaml │ ├── hooks.yaml │ ├── istio.yaml │ ├── networkpolicy.yaml │ ├── serviceaccount.yaml │ ├── webhook-cert-manager.yaml │ ├── webhook-configuration-mutating.yaml │ ├── webhook-configuration-validating.yaml │ └── webhook-service.yaml │ └── values.yaml ├── dockerfiles ├── 
mpi-init.Dockerfile ├── mpi-sync.Dockerfile ├── mpi-worker-start.sh ├── openssh.gpgkey ├── rsync-start.sh └── rsyncd.conf ├── docs ├── development.md └── img │ └── logo.png ├── go.mod ├── go.sum ├── hack └── boilerplate.go.txt ├── istio ├── global-strict-mtls.yaml └── operator-minimal.yaml ├── main.go ├── pkg ├── cluster │ ├── dask │ │ ├── clientports.go │ │ ├── clusterstatusupdate.go │ │ ├── configmap.go │ │ ├── dask_test.go │ │ ├── horizonalpodautoscaler.go │ │ ├── horizontalpodautoscaler_test.go │ │ ├── istiopeerauthentication.go │ │ ├── metadata.go │ │ ├── networkpolicy.go │ │ ├── networkpolicy_test.go │ │ ├── rbac.go │ │ ├── rbac_test.go │ │ ├── service.go │ │ ├── service_test.go │ │ ├── serviceaccount.go │ │ ├── serviceaccount_test.go │ │ └── statefulset.go │ ├── metadata │ │ └── metadata.go │ └── mpi │ │ ├── clientports.go │ │ ├── configmap.go │ │ ├── istiopeerauthentication.go │ │ ├── metadata.go │ │ ├── mpi.go │ │ ├── networkpolicy.go │ │ ├── podsecuritypolicy.go │ │ ├── service.go │ │ ├── serviceaccount.go │ │ ├── statefulset.go │ │ └── statusupdate.go ├── controller │ ├── actions │ │ └── actions.go │ ├── components │ │ ├── clientports.go │ │ ├── clusterstatusupdate.go │ │ ├── configmap.go │ │ ├── horizontalpodautoscaler.go │ │ ├── istiopeerauthentication.go │ │ ├── networkpolicy.go │ │ ├── rbac.go │ │ ├── service.go │ │ ├── serviceaccount.go │ │ └── statefulset.go │ └── core │ │ ├── components.go │ │ ├── context.go │ │ ├── patch.go │ │ └── reconciler.go ├── crd │ ├── crd.go │ ├── crd_test.go │ └── istio.go ├── manager │ └── manager.go ├── resources │ ├── istio │ │ ├── peerauthentication.go │ │ └── peerauthentication_test.go │ ├── metadata.go │ ├── metadata_test.go │ ├── ray │ │ ├── helpers_test.go │ │ ├── horizontalpodautoscaler.go │ │ ├── horizontalpodautoscaler_test.go │ │ ├── networkpolicy.go │ │ ├── networkpolicy_test.go │ │ ├── podsecuritypolicy.go │ │ ├── podsecuritypolicy_test.go │ │ ├── ray.go │ │ ├── ray_test.go │ │ ├── service.go │ │ ├── service_test.go │ │ ├── serviceaccount.go │ │ ├── serviceaccount_test.go │ │ ├── statefulset.go │ │ └── statefulset_test.go │ └── spark │ │ ├── configmap.go │ │ ├── configmap_test.go │ │ ├── envoyfilter.go │ │ ├── envoyfilter_test.go │ │ ├── helpers_test.go │ │ ├── horizontalpodautoscaler.go │ │ ├── horizontalpodautoscaler_test.go │ │ ├── networkpolicy.go │ │ ├── networkpolicy_test.go │ │ ├── podsecuritypolicy.go │ │ ├── podsecuritypolicy_test.go │ │ ├── service.go │ │ ├── service_test.go │ │ ├── serviceaccount.go │ │ ├── serviceaccount_test.go │ │ ├── spark.go │ │ ├── spark_test.go │ │ ├── statefulset.go │ │ └── statefulset_test.go └── util │ ├── util.go │ └── util_test.go ├── scripts ├── development.sh ├── hotpatch.sh └── release │ ├── before-hook.sh │ └── helm.sh └── test └── test.go /.codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | target: 75% 6 | threshold: 10% 7 | patch: off 8 | ignore: 9 | - "api/**/zz_generated.deepcopy.go" 10 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file 2 | # Ignore all files which are not go type 3 | !**/*.go 4 | !**/*.mod 5 | !**/*.sum 6 | 7 | # Ignore bin directories 8 | bin/* 9 | testbin/* 10 | -------------------------------------------------------------------------------- 
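A note on the `.dockerignore` above: Docker treats `!` lines only as exceptions to earlier exclusion patterns, so without a catch-all exclusion the negations do not by themselves restrict the build context to Go sources; in practice the Dockerfile compensates by copying only specific paths. A minimal sketch of a variant that matches the stated intent (illustrative only, not part of this repository):

```
# exclude the whole build context by default
**/*
# then re-include only what the Dockerfile needs
!**/*.go
!**/*.mod
!**/*.sum
```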
/.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @dominodatalab/workbench-train 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Binaries for programs and plugins 3 | *.exe 4 | *.exe~ 5 | *.dll 6 | *.so 7 | *.dylib 8 | bin 9 | testbin/* 10 | 11 | # Test binary, build with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Kubernetes Generated files - skip generated files, except for vendored files 18 | 19 | !vendor/**/zz_generated.* 20 | 21 | # editor and IDE paraphernalia 22 | .idea 23 | *.swp 24 | *.swo 25 | *~ 26 | 27 | # goreleaser directories 28 | dist/ 29 | custom-resource-definitions/ 30 | 31 | # miscellaneous local files 32 | exclude/ 33 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | run: 2 | timeout: 2m 3 | 4 | linters-settings: 5 | dupl: 6 | threshold: 100 7 | exhaustive: 8 | default-signifies-exhaustive: true 9 | funlen: 10 | lines: 150 11 | statements: 70 12 | gci: 13 | local-prefixes: github.com/dominodatalab/distributed-compute-operator 14 | goconst: 15 | min-len: 2 16 | min-occurrences: 2 17 | gocyclo: 18 | min-complexity: 25 19 | goimports: 20 | local-prefixes: github.com/dominodatalab/distributed-compute-operator 21 | gomnd: 22 | settings: 23 | mnd: 24 | checks: argument,case,condition,return 25 | ignored-functions: log.V 26 | govet: 27 | check-shadowing: true 28 | lll: 29 | line-length: 140 30 | maligned: 31 | suggest-new: true 32 | misspell: 33 | locale: US 34 | nestif: 35 | min-complexity: 6 36 | 37 | linters: 38 | disable-all: true 39 | enable: 40 | - bodyclose 41 | - depguard 42 | - dogsled 43 | - dupl 44 | - errcheck 45 | - errorlint 46 | - exhaustive 47 | - funlen 48 | - goconst 49 | - gocritic 50 | - gocyclo 51 | - gofmt 52 | - goimports 53 | - revive 54 | - gomnd 55 | - goprintffuncname 56 | - gosec 57 | - gosimple 58 | - govet 59 | - ineffassign 60 | - lll 61 | - misspell 62 | - nakedret 63 | - nestif 64 | - noctx 65 | - exportloopref 66 | - staticcheck 67 | - stylecheck 68 | - typecheck 69 | - unconvert 70 | - unparam 71 | - unused 72 | - whitespace 73 | 74 | issues: 75 | exclude-rules: 76 | - path: _test\.go 77 | linters: 78 | - dupl 79 | - exhaustive 80 | - gocyclo 81 | - gomnd 82 | - gosec 83 | - funlen 84 | - path: test/test.go 85 | linters: 86 | - dogsled 87 | - source: "^//\\s*\\+kubebuilder:.+" 88 | linters: 89 | - lll 90 | exclude: 91 | - Using the variable on range scope `tc` in function literal 92 | 93 | -------------------------------------------------------------------------------- /.goreleaser.yml: -------------------------------------------------------------------------------- 1 | before: 2 | hooks: 3 | - scripts/release/before-hook.sh 4 | builds: 5 | - env: 6 | - CGO_ENABLED=0 7 | goos: 8 | - linux 9 | - darwin 10 | goarch: 11 | - amd64 12 | archives: 13 | - replacements: 14 | amd64: x86_64 15 | files: 16 | - LICENSE 17 | - README.md 18 | - custom-resource-definitions/*.yaml 19 | - deploy/* 20 | checksum: 21 | name_template: 'checksums.txt' 22 | snapshot: 23 | name_template: "{{ .Tag }}-next" 24 | changelog: 25 | sort: asc 26 | filters: 27 | exclude: 28 | - '^docs:' 29 | - '^test:' 30 | 
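For reference, the `.golangci.yml` and `.goreleaser.yml` configurations above are typically exercised locally with commands along these lines (hypothetical invocations; the project Makefile, whose contents are not shown in this section, may already wrap them, and exact GoReleaser flags vary by version):

```shell
# run the linters configured in .golangci.yml
$ golangci-lint run ./...

# build a local snapshot release against .goreleaser.yml without publishing
$ goreleaser release --snapshot --skip-publish
```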
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build the manager binary 2 | FROM golang:1.21.3 as builder 3 | 4 | WORKDIR /workspace 5 | # Copy the Go Modules manifests 6 | COPY go.mod go.mod 7 | COPY go.sum go.sum 8 | # cache deps before building and copying source so that we don't need to re-download as much 9 | # and so that source changes don't invalidate our downloaded layer 10 | RUN go mod download 11 | 12 | # Copy the go source 13 | COPY main.go main.go 14 | COPY cmd/ cmd/ 15 | COPY api/ api/ 16 | COPY config/crd/ config/crd/ 17 | COPY controllers/ controllers/ 18 | COPY pkg/ pkg/ 19 | 20 | # Build 21 | RUN CGO_ENABLED=0 GOOS=linux GO111MODULE=on go build -a -o manager main.go 22 | 23 | # Use distroless as minimal base image to package the manager binary 24 | # Refer to https://github.com/GoogleContainerTools/distroless for more details 25 | FROM gcr.io/distroless/static-debian11:nonroot 26 | WORKDIR / 27 | COPY --from=builder /workspace/manager . 28 | USER 65532:65532 29 | 30 | ENTRYPOINT ["/manager"] 31 | -------------------------------------------------------------------------------- /PROJECT: -------------------------------------------------------------------------------- 1 | domain: dominodatalab.com 2 | layout: 3 | - go.kubebuilder.io/v3 4 | projectName: distributed-compute-operator 5 | repo: github.com/dominodatalab/distributed-compute-operator 6 | resources: 7 | - api: 8 | crdVersion: v1 9 | namespaced: true 10 | controller: true 11 | domain: dominodatalab.com 12 | group: distributed-compute 13 | kind: RayCluster 14 | path: github.com/dominodatalab/distributed-compute-operator/api/v1alpha1 15 | version: v1alpha1 16 | webhooks: 17 | defaulting: true 18 | validation: true 19 | webhookVersion: v1 20 | - api: 21 | crdVersion: v1 22 | namespaced: true 23 | controller: true 24 | domain: dominodatalab.com 25 | group: distributed-compute 26 | kind: DaskCluster 27 | path: github.com/dominodatalab/distributed-compute-operator/api/v1alpha1 28 | version: v1alpha1 29 | webhooks: 30 | defaulting: true 31 | validation: true 32 | webhookVersion: v1 33 | - api: 34 | crdVersion: v1 35 | namespaced: true 36 | controller: true 37 | domain: dominodatalab.com 38 | group: distributed-compute 39 | kind: SparkCluster 40 | path: github.com/dominodatalab/distributed-compute-operator/api/v1alpha1 41 | version: v1alpha1 42 | webhooks: 43 | defaulting: true 44 | validation: true 45 | webhookVersion: v1 46 | - api: 47 | crdVersion: v1 48 | namespaced: true 49 | controller: true 50 | domain: dominodatalab.com 51 | group: distributed-compute 52 | kind: MPICluster 53 | path: github.com/dominodatalab/distributed-compute-operator/api/v1alpha1 54 | version: v1alpha1 55 | webhooks: 56 | defaulting: true 57 | validation: true 58 | webhookVersion: v1 59 | version: "3" 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Logo 3 | 4 | 5 | 6 | GitHub release 7 | 8 | Go workflow 9 | 10 | 11 | Go report card 12 | 13 | 14 | Codecov 15 | 16 | 17 | PkgGoDev 18 | 19 | 20 | 21 | License 22 | 23 |

24 | 25 | # IMPORTANT 26 | 27 | This repository has now been archived. No further work will be merged here. 28 | 29 | # Distributed Compute Operator 30 | 31 | Kubernetes operator providing Ray|Spark|Dask clusters on-demand via [Custom Resource Definitions][custom resources]. 32 | 33 | ## Overview 34 | 35 | TODO 36 | 37 | ## Installation 38 | 39 | The easiest way to install DCO is to use the provided Helm chart. 40 | 41 | ### Prerequisites 42 | 43 | Before you get started using the DCO, you need to have a running Kubernetes cluster. 44 | 45 | - Access to a Kubernetes cluster version **1.16** or above. This version of the 46 | CRD API is stable and supports our required features. 47 | - Install [helm] client version **3.0.0** or above. 48 | - Install the [cert-manager] operator. DCO makes extensive use of [webhooks] 49 | which require TLS. 50 | 51 | ### Install 52 | 53 | ```shell 54 | $ helm install distributed-compute-operator ./deploy/helm/distributed-compute-operator 55 | ``` 56 | 57 | ## Development 58 | 59 | The following instructions will help you create a local Kubernetes environment 60 | that can be used to test every feature supported by this operator. 61 | 62 | 1. Install [minikube] and create a new cluster. 63 | 64 | ```shell 65 | # tested using minikube v1.17.1 and k8s v1.21.3 66 | $ minikube start \ 67 | --cpus=6 --memory=16384 --driver=hyperkit \ 68 | --extra-config=apiserver.enable-admission-plugins=PodSecurityPolicy \ 69 | --addons=pod-security-policy 70 | ``` 71 | 72 | 1. Install cert-manager 73 | 1. Install metrics-server 74 | 1. Launch operator 75 | 76 | [custom resources]: https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/ 77 | [helm]: https://helm.sh/docs/intro/install/ 78 | [cert-manager]: https://cert-manager.io/docs/ 79 | [webhooks]: https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/ 80 | [minikube]: https://minikube.sigs.k8s.io/docs/ 81 | -------------------------------------------------------------------------------- /api/v1alpha1/daskcluster_types.go: -------------------------------------------------------------------------------- 1 | package v1alpha1 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | ) 7 | 8 | // DaskClusterWorker defines worker-specific workload settings. 9 | type DaskClusterWorker struct { 10 | WorkloadConfig `json:",inline"` 11 | Replicas *int32 `json:"replicas,omitempty"` 12 | } 13 | 14 | // DaskClusterSpec defines the desired state of DaskCluster. 15 | type DaskClusterSpec struct { 16 | ScalableClusterConfig `json:",inline"` 17 | 18 | Scheduler WorkloadConfig `json:"scheduler,omitempty"` 19 | Worker DaskClusterWorker `json:"worker,omitempty"` 20 | 21 | SchedulerPort int32 `json:"schedulerPort,omitempty"` 22 | DashboardPort int32 `json:"dashboardPort,omitempty"` 23 | WorkerPort int32 `json:"workerPort,omitempty"` 24 | NannyPort int32 `json:"nannyPort,omitempty"` 25 | 26 | // AdditionalClientPorts are extra ports through which cluster nodes could connect to the client. 
27 | AdditionalClientPorts []corev1.ServicePort `json:"additionalClientPorts,omitempty"` 28 | } 29 | 30 | // DaskClusterStatus defines the observed state of DaskCluster 31 | type DaskClusterStatus struct { 32 | ClusterStatusConfig `json:",inline"` 33 | } 34 | 35 | //+kubebuilder:object:root=true 36 | //+kubebuilder:resource:shortName=dask 37 | //+kubebuilder:subresource:status 38 | //+kubebuilder:subresource:scale:specpath=.spec.worker.replicas,statuspath=.status.workerReplicas,selectorpath=.status.workerSelector 39 | //+kubebuilder:printcolumn:name="Workers",type=integer,JSONPath=".spec.worker.replicas" 40 | //+kubebuilder:printcolumn:name="Status",type=string,JSONPath=".status.clusterStatus" 41 | //+kubebuilder:printcolumn:name="Age",type=date,JSONPath=".metadata.creationTimestamp" 42 | //+kubebuilder:printcolumn:name="Image",type=string,JSONPath=".spec.image" 43 | //+kubebuilder:printcolumn:name="Network Policy",type=boolean,JSONPath=".spec.networkPolicy.enabled",priority=10 44 | //+kubebuilder:printcolumn:name="Pods",type=string,JSONPath=".status.nodes",priority=10 45 | 46 | // DaskCluster is the Schema for the daskclusters API. 47 | type DaskCluster struct { 48 | metav1.TypeMeta `json:",inline"` 49 | metav1.ObjectMeta `json:"metadata,omitempty"` 50 | 51 | Spec DaskClusterSpec `json:"spec,omitempty"` 52 | Status DaskClusterStatus `json:"status,omitempty"` 53 | } 54 | 55 | //+kubebuilder:object:root=true 56 | 57 | // DaskClusterList contains a list of DaskCluster. 58 | type DaskClusterList struct { 59 | metav1.TypeMeta `json:",inline"` 60 | metav1.ListMeta `json:"metadata,omitempty"` 61 | Items []DaskCluster `json:"items"` 62 | } 63 | 64 | func init() { 65 | SchemeBuilder.Register(&DaskCluster{}, &DaskClusterList{}) 66 | } 67 | -------------------------------------------------------------------------------- /api/v1alpha1/groupversion_info.go: -------------------------------------------------------------------------------- 1 | // Package v1alpha1 contains API Schema definitions for the distributed-compute v1alpha1 API group 2 | // +kubebuilder:object:generate=true 3 | // +groupName=distributed-compute.dominodatalab.com 4 | package v1alpha1 5 | 6 | import ( 7 | "k8s.io/apimachinery/pkg/runtime/schema" 8 | "sigs.k8s.io/controller-runtime/pkg/scheme" 9 | ) 10 | 11 | var ( 12 | // GroupVersion is group version used to register these objects 13 | GroupVersion = schema.GroupVersion{Group: "distributed-compute.dominodatalab.com", Version: "v1alpha1"} 14 | 15 | // SchemeBuilder is used to add go types to the GroupVersionKind scheme 16 | SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} 17 | 18 | // AddToScheme adds the types in this group-version to the given scheme. 19 | AddToScheme = SchemeBuilder.AddToScheme 20 | ) 21 | -------------------------------------------------------------------------------- /api/v1alpha1/mpicluster_types.go: -------------------------------------------------------------------------------- 1 | package v1alpha1 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | ) 7 | 8 | // MPIClusterWorker defines worker-specific workload settings. 
9 | type MPIClusterWorker struct { 10 | WorkloadConfig `json:",inline"` 11 | Replicas *int32 `json:"replicas,omitempty"` 12 | SharedSSHSecret string `json:"sharedSSHSecret"` 13 | UserName string `json:"userName,omitempty"` 14 | UserID *int64 `json:"userID,omitempty"` 15 | GroupName string `json:"groupName,omitempty"` 16 | GroupID *int64 `json:"groupID,omitempty"` 17 | HomeDir string `json:"homeDir,omitempty"` 18 | } 19 | 20 | // MPIClusterSpec defines the desired state of MPICluster. 21 | type MPIClusterSpec struct { 22 | ClusterConfig `json:",inline"` 23 | Worker MPIClusterWorker `json:"worker,omitempty"` 24 | 25 | // WorkerPorts specifies the range of ports used by worker processes for communication. 26 | WorkerPorts []int32 `json:"workerPorts,omitempty"` 27 | // AdditionalClientPorts are extra ports through which cluster nodes could connect to the client. 28 | AdditionalClientPorts []corev1.ServicePort `json:"additionalClientPorts,omitempty"` 29 | } 30 | 31 | //+kubebuilder:object:root=true 32 | //+kubebuilder:resource:shortName=mpi 33 | //+kubebuilder:subresource:status 34 | //+kubebuilder:printcolumn:name="Workers",type=integer,JSONPath=".spec.worker.replicas" 35 | //+kubebuilder:printcolumn:name="Status",type=string,JSONPath=".status.clusterStatus" 36 | //+kubebuilder:printcolumn:name="Age",type=date,JSONPath=".metadata.creationTimestamp" 37 | //+kubebuilder:printcolumn:name="Image",type=string,JSONPath=".status.image",priority=10 38 | //+kubebuilder:printcolumn:name="Bound PSP",type=string,JSONPath=".spec.podSecurityPolicy",priority=10 39 | //+kubebuilder:printcolumn:name="Network Policy",type=boolean,JSONPath=".spec.networkPolicy.enabled",priority=10 40 | //+kubebuilder:printcolumn:name="Pods",type=string,JSONPath=".status.nodes",priority=10 41 | 42 | // MPICluster is the Schema for the MPI Clusters API. 43 | type MPICluster struct { 44 | metav1.TypeMeta `json:",inline"` 45 | metav1.ObjectMeta `json:"metadata,omitempty"` 46 | Spec MPIClusterSpec `json:"spec,omitempty"` 47 | Status ClusterStatusConfig `json:"status,omitempty"` 48 | } 49 | 50 | //+kubebuilder:object:root=true 51 | 52 | // MPIClusterList contains a list of MPICluster. 
53 | type MPIClusterList struct { 54 | metav1.TypeMeta `json:",inline"` 55 | metav1.ListMeta `json:"metadata,omitempty"` 56 | Items []MPICluster `json:"items"` 57 | } 58 | 59 | func init() { 60 | SchemeBuilder.Register(&MPICluster{}, &MPIClusterList{}) 61 | } 62 | -------------------------------------------------------------------------------- /cluster-testing/dask.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: dask-notebook 5 | 6 | --- 7 | apiVersion: rbac.authorization.k8s.io/v1 8 | kind: Role 9 | metadata: 10 | name: dask-notebook 11 | rules: 12 | - apiGroups: ['policy'] 13 | resources: ['podsecuritypolicies'] 14 | verbs: ['use'] 15 | resourceNames: 16 | - privileged 17 | 18 | --- 19 | apiVersion: rbac.authorization.k8s.io/v1 20 | kind: RoleBinding 21 | metadata: 22 | name: dask-notebook 23 | roleRef: 24 | kind: Role 25 | name: dask-notebook 26 | apiGroup: rbac.authorization.k8s.io 27 | subjects: 28 | - kind: ServiceAccount 29 | name: dask-notebook 30 | 31 | --- 32 | apiVersion: apps/v1 33 | kind: Deployment 34 | metadata: 35 | name: dask-notebook 36 | labels: 37 | app: dask-notebook 38 | dask-client: "true" 39 | spec: 40 | replicas: 1 41 | selector: 42 | matchLabels: 43 | app: dask-notebook 44 | template: 45 | metadata: 46 | labels: 47 | app: dask-notebook 48 | dask-client: "true" 49 | spec: 50 | serviceAccountName: dask-notebook 51 | containers: 52 | - name: dask-notebook 53 | image: daskdev/dask-notebook:2021.7.2 54 | ports: 55 | - containerPort: 8888 56 | 57 | --- 58 | apiVersion: v1 59 | kind: Service 60 | metadata: 61 | name: dask-notebook 62 | spec: 63 | type: NodePort 64 | selector: 65 | app: dask-notebook 66 | ports: 67 | - protocol: TCP 68 | name: tcp-ui 69 | port: 8888 70 | targetPort: 8888 71 | -------------------------------------------------------------------------------- /cluster-testing/ray.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: test-ray 5 | 6 | --- 7 | apiVersion: rbac.authorization.k8s.io/v1 8 | kind: Role 9 | metadata: 10 | name: test-ray 11 | rules: 12 | - apiGroups: ["policy"] 13 | resources: ["podsecuritypolicies"] 14 | verbs: ["use"] 15 | resourceNames: 16 | - privileged 17 | 18 | --- 19 | apiVersion: rbac.authorization.k8s.io/v1 20 | kind: RoleBinding 21 | metadata: 22 | name: test-ray 23 | roleRef: 24 | kind: Role 25 | name: test-ray 26 | apiGroup: rbac.authorization.k8s.io 27 | subjects: 28 | - kind: ServiceAccount 29 | name: test-ray 30 | 31 | --- 32 | apiVersion: apps/v1 33 | kind: Deployment 34 | metadata: 35 | name: test-ray 36 | labels: 37 | app: test-ray 38 | version: 1.6.0-cpu 39 | ray-client: "true" 40 | spec: 41 | replicas: 1 42 | selector: 43 | matchLabels: 44 | app: test-ray 45 | template: 46 | metadata: 47 | labels: 48 | app: test-ray 49 | version: 1.6.0-cpu 50 | ray-client: "true" 51 | spec: 52 | serviceAccountName: test-ray 53 | containers: 54 | - name: ray 55 | image: rayproject/ray:1.6.0-cpu 56 | command: ["sleep", "86400"] 57 | -------------------------------------------------------------------------------- /cmd/crdapply.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/spf13/cobra" 7 | 8 | "github.com/dominodatalab/distributed-compute-operator/pkg/crd" 9 | ) 10 | 11 | var crdApplyCmd = &cobra.Command{ 12 | Use: "crd-apply", 
13 | Short: "Apply custom resource definitions to a cluster", 14 | Long: `Apply all "distributed-compute.dominodatalab.com" CRDs to a cluster. 15 | 16 | Apply Rules: 17 | - When a definition is is missing, it will be created 18 | - If a definition is already present, then it will be updated 19 | - Updating definitions that have not changed results in a no-op`, 20 | RunE: func(cmd *cobra.Command, args []string) error { 21 | return crd.Apply(context.Background(), istioEnabled) 22 | }, 23 | } 24 | 25 | func init() { 26 | rootCmd.AddCommand(crdApplyCmd) 27 | } 28 | -------------------------------------------------------------------------------- /cmd/crddelete.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/spf13/cobra" 7 | 8 | "github.com/dominodatalab/distributed-compute-operator/pkg/crd" 9 | ) 10 | 11 | var crdDeleteCmd = &cobra.Command{ 12 | Use: "crd-delete", 13 | Short: "Delete custom resource definitions from a cluster", 14 | Long: `Delete all "distributed-compute.dominodatalab.com" CRDs from a cluster. 15 | 16 | Any running distributed compute resources will be decommissioned when this 17 | operation runs (i.e. your deployments will be deleted immediately). This will 18 | only attempt to remove definitions that are already present in Kubernetes.`, 19 | RunE: func(cmd *cobra.Command, args []string) error { 20 | return crd.Delete(context.Background(), istioEnabled) 21 | }, 22 | } 23 | 24 | func init() { 25 | rootCmd.AddCommand(crdDeleteCmd) 26 | } 27 | -------------------------------------------------------------------------------- /cmd/root.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/spf13/cobra" 8 | ) 9 | 10 | var istioEnabled bool 11 | 12 | var rootCmd = &cobra.Command{ 13 | Use: "distributed-compute-operator", 14 | Short: "Kubernetes operator that manages parallel computing clusters.", 15 | Long: `Kubernetes operator that manages parallel computing clusters.`, 16 | } 17 | 18 | // Execute launches the command line tool. 
19 | func Execute() { 20 | if err := rootCmd.Execute(); err != nil { 21 | fmt.Println(err) 22 | os.Exit(1) 23 | } 24 | } 25 | 26 | func init() { 27 | // NOTE: required until https://github.com/spf13/cobra/issues/587 28 | rootCmd.SetHelpCommand(&cobra.Command{Hidden: true}) 29 | rootCmd.PersistentFlags().BoolVar(&istioEnabled, "istio-enabled", false, "Enable support for Istio sidecar container") 30 | } 31 | -------------------------------------------------------------------------------- /cmd/start.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "flag" 5 | 6 | "github.com/dominodatalab/distributed-compute-operator/controllers" 7 | 8 | "github.com/spf13/cobra" 9 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 10 | 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/manager" 12 | ) 13 | 14 | const WebhookPort = 9443 15 | 16 | var ( 17 | namespaces []string 18 | probeAddr string 19 | metricsAddr string 20 | webhookPort int 21 | enableLeaderElection bool 22 | zapOpts = zap.Options{} 23 | mpiInitImage string 24 | mpiSyncImage string 25 | ) 26 | 27 | var startCmd = &cobra.Command{ 28 | Use: "start", 29 | Short: "Start the controller manager", 30 | RunE: func(cmd *cobra.Command, args []string) error { 31 | cfg := &controllers.Config{ 32 | Namespaces: namespaces, 33 | MetricsAddr: metricsAddr, 34 | HealthProbeAddr: probeAddr, 35 | WebhookServerPort: webhookPort, 36 | EnableLeaderElection: enableLeaderElection, 37 | IstioEnabled: istioEnabled, 38 | ZapOptions: zapOpts, 39 | MPIInitImage: mpiInitImage, 40 | MPISyncImage: mpiSyncImage, 41 | } 42 | 43 | return manager.Start(cfg) 44 | }, 45 | } 46 | 47 | func init() { 48 | startCmd.Flags().SortFlags = false 49 | 50 | fs := new(flag.FlagSet) 51 | zapOpts.BindFlags(fs) 52 | 53 | startCmd.Flags().AddGoFlagSet(fs) 54 | startCmd.Flags().StringSliceVar(&namespaces, "namespaces", nil, 55 | "Only reconcile resources in these namespaces") 56 | startCmd.Flags().IntVar(&webhookPort, "webhook-server-port", WebhookPort, 57 | "Webhook server will bind to this port") 58 | startCmd.Flags().StringVar(&metricsAddr, "metrics-bind-address", ":8080", 59 | "Metrics endpoint will bind to this address") 60 | startCmd.Flags().StringVar(&probeAddr, "health-probe-bind-address", ":8081", 61 | "Health probe endpoint will bind to this address") 62 | startCmd.Flags().BoolVar(&enableLeaderElection, "leader-elect", false, 63 | "Enable leader election to ensure there is only one active controller manager") 64 | startCmd.Flags().StringVar(&mpiInitImage, "mpi-init-image", "", 65 | "Image for MPI worker init container") 66 | startCmd.Flags().StringVar(&mpiSyncImage, "mpi-sync-image", "", 67 | "Image for MPI worker sync container") 68 | 69 | rootCmd.AddCommand(startCmd) 70 | } 71 | -------------------------------------------------------------------------------- /config/certmanager/certificate.yaml: -------------------------------------------------------------------------------- 1 | # The following manifests contain a self-signed issuer CR and a certificate CR. 2 | # More document can be found at https://docs.cert-manager.io 3 | # WARNING: Targets CertManager v1.0. Check https://cert-manager.io/docs/installation/upgrading/ for breaking changes. 
4 | apiVersion: cert-manager.io/v1 5 | kind: Issuer 6 | metadata: 7 | name: selfsigned-issuer 8 | namespace: system 9 | spec: 10 | selfSigned: {} 11 | --- 12 | apiVersion: cert-manager.io/v1 13 | kind: Certificate 14 | metadata: 15 | name: serving-cert # this name should match the one appeared in kustomizeconfig.yaml 16 | namespace: system 17 | spec: 18 | # $(SERVICE_NAME) and $(SERVICE_NAMESPACE) will be substituted by kustomize 19 | dnsNames: 20 | - $(SERVICE_NAME).$(SERVICE_NAMESPACE).svc 21 | - $(SERVICE_NAME).$(SERVICE_NAMESPACE).svc.cluster.local 22 | issuerRef: 23 | kind: Issuer 24 | name: selfsigned-issuer 25 | secretName: webhook-server-cert # this secret will not be prefixed, since it's not managed by kustomize 26 | -------------------------------------------------------------------------------- /config/certmanager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - certificate.yaml 3 | 4 | configurations: 5 | - kustomizeconfig.yaml 6 | -------------------------------------------------------------------------------- /config/certmanager/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # This configuration is for teaching kustomize how to update name ref and var substitution 2 | nameReference: 3 | - kind: Issuer 4 | group: cert-manager.io 5 | fieldSpecs: 6 | - kind: Certificate 7 | group: cert-manager.io 8 | path: spec/issuerRef/name 9 | 10 | varReference: 11 | - kind: Certificate 12 | group: cert-manager.io 13 | path: spec/commonName 14 | - kind: Certificate 15 | group: cert-manager.io 16 | path: spec/dnsNames 17 | -------------------------------------------------------------------------------- /config/crd/embed.go: -------------------------------------------------------------------------------- 1 | package crd 2 | 3 | import ( 4 | "embed" 5 | "path/filepath" 6 | ) 7 | 8 | // NOTE: If we start using conversion webhooks in the future and need to 9 | // "patch" our CRD bases with `kustomize', we can (1) pre-process CRDs during 10 | // build time and store them in "config/crd/processed", (2) git ignore that 11 | // directory, (3) and embed that directory instead of "bases". 12 | 13 | //go:embed bases/*.yaml 14 | var bases embed.FS 15 | 16 | const contentDir = "bases" 17 | 18 | // Definition represents the metadata and contents of a single custom resource definition. 19 | type Definition struct { 20 | Filename string 21 | Contents []byte 22 | } 23 | 24 | // ReadAll returns a slice of custom resource Definition objects. 25 | func ReadAll() (definitions []Definition, err error) { 26 | files, err := bases.ReadDir(contentDir) 27 | if err != nil { 28 | return 29 | } 30 | 31 | for _, f := range files { 32 | if f.IsDir() { 33 | continue 34 | } 35 | 36 | var contents []byte 37 | contents, err = bases.ReadFile(filepath.Join(contentDir, f.Name())) 38 | if err != nil { 39 | return 40 | } 41 | 42 | definitions = append(definitions, Definition{ 43 | Filename: f.Name(), 44 | Contents: contents, 45 | }) 46 | } 47 | 48 | return definitions, nil 49 | } 50 | -------------------------------------------------------------------------------- /config/crd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # This kustomization.yaml is not intended to be run by itself, 2 | # since it depends on service name and namespace that are out of this kustomize package. 
3 | # It should be run by config/default 4 | resources: 5 | - bases/distributed-compute.dominodatalab.com_rayclusters.yaml 6 | - bases/distributed-compute.dominodatalab.com_sparkclusters.yaml 7 | - bases/distributed-compute.dominodatalab.com_daskclusters.yaml 8 | - bases/distributed-compute.dominodatalab.com_mpiclusters.yaml 9 | #+kubebuilder:scaffold:crdkustomizeresource 10 | 11 | patchesStrategicMerge: 12 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. 13 | # patches here are for enabling the conversion webhook for each CRD 14 | - patches/webhook_in_rayclusters.yaml 15 | - patches/webhook_in_sparkclusters.yaml 16 | - patches/webhook_in_daskclusters.yaml 17 | #- patches/webhook_in_mpiclusters.yaml 18 | #+kubebuilder:scaffold:crdkustomizewebhookpatch 19 | 20 | # [CERTMANAGER] To enable webhook, uncomment all the sections with [CERTMANAGER] prefix. 21 | # patches here are for enabling the CA injection for each CRD 22 | #- patches/cainjection_in_rayclusters.yaml 23 | #- patches/cainjection_in_sparkclusters.yaml 24 | #- patches/cainjection_in_daskclusters.yaml 25 | #- patches/cainjection_in_mpiclusters.yaml 26 | #+kubebuilder:scaffold:crdkustomizecainjectionpatch 27 | 28 | # the following config is for teaching kustomize how to do kustomization for CRDs. 29 | configurations: 30 | - kustomizeconfig.yaml 31 | -------------------------------------------------------------------------------- /config/crd/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # This file is for teaching kustomize how to substitute name and namespace reference in CRD 2 | nameReference: 3 | - kind: Service 4 | version: v1 5 | fieldSpecs: 6 | - kind: CustomResourceDefinition 7 | version: v1 8 | group: apiextensions.k8s.io 9 | path: spec/conversion/webhook/clientConfig/service/name 10 | 11 | namespace: 12 | - kind: CustomResourceDefinition 13 | version: v1 14 | group: apiextensions.k8s.io 15 | path: spec/conversion/webhook/clientConfig/service/namespace 16 | create: false 17 | 18 | varReference: 19 | - path: metadata/annotations 20 | -------------------------------------------------------------------------------- /config/crd/patches/cainjection_in_daskclusters.yaml: -------------------------------------------------------------------------------- 1 | # The following patch adds a directive for certmanager to inject CA into the CRD 2 | apiVersion: apiextensions.k8s.io/v1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | annotations: 6 | cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) 7 | name: daskclusters.distributed-compute.dominodatalab.com 8 | -------------------------------------------------------------------------------- /config/crd/patches/cainjection_in_mpiclusters.yaml: -------------------------------------------------------------------------------- 1 | # The following patch adds a directive for certmanager to inject CA into the CRD 2 | apiVersion: apiextensions.k8s.io/v1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | annotations: 6 | cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) 7 | name: mpiclusters.distributed-compute.dominodatalab.com 8 | -------------------------------------------------------------------------------- /config/crd/patches/cainjection_in_rayclusters.yaml: -------------------------------------------------------------------------------- 1 | # The following patch adds a directive for certmanager to inject CA into the CRD 2 | apiVersion: 
apiextensions.k8s.io/v1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | annotations: 6 | cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) 7 | name: rayclusters.distributed-compute.dominodatalab.com 8 | -------------------------------------------------------------------------------- /config/crd/patches/cainjection_in_sparkclusters.yaml: -------------------------------------------------------------------------------- 1 | # The following patch adds a directive for certmanager to inject CA into the CRD 2 | apiVersion: apiextensions.k8s.io/v1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | annotations: 6 | cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) 7 | name: sparkclusters.distributed-compute.dominodatalab.com 8 | -------------------------------------------------------------------------------- /config/crd/patches/webhook_in_daskclusters.yaml: -------------------------------------------------------------------------------- 1 | # The following patch enables a conversion webhook for the CRD 2 | apiVersion: apiextensions.k8s.io/v1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | name: daskclusters.distributed-compute.dominodatalab.com 6 | spec: 7 | conversion: 8 | strategy: Webhook 9 | webhook: 10 | conversionReviewVersions: ["v1","v1beta1"] 11 | clientConfig: 12 | service: 13 | namespace: system 14 | name: webhook-service 15 | path: /convert 16 | -------------------------------------------------------------------------------- /config/crd/patches/webhook_in_mpiclusters.yaml: -------------------------------------------------------------------------------- 1 | # The following patch enables a conversion webhook for the CRD 2 | apiVersion: apiextensions.k8s.io/v1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | name: mpiclusters.distributed-compute.dominodatalab.com 6 | spec: 7 | conversion: 8 | strategy: Webhook 9 | webhook: 10 | clientConfig: 11 | service: 12 | namespace: system 13 | name: webhook-service 14 | path: /convert 15 | conversionReviewVersions: 16 | - v1 17 | -------------------------------------------------------------------------------- /config/crd/patches/webhook_in_rayclusters.yaml: -------------------------------------------------------------------------------- 1 | # The following patch enables a conversion webhook for the CRD 2 | apiVersion: apiextensions.k8s.io/v1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | name: rayclusters.distributed-compute.dominodatalab.com 6 | spec: 7 | conversion: 8 | strategy: Webhook 9 | webhook: 10 | conversionReviewVersions: ["v1","v1beta1"] 11 | clientConfig: 12 | service: 13 | namespace: system 14 | name: webhook-service 15 | path: /convert 16 | -------------------------------------------------------------------------------- /config/crd/patches/webhook_in_sparkclusters.yaml: -------------------------------------------------------------------------------- 1 | # The following patch enables a conversion webhook for the CRD 2 | apiVersion: apiextensions.k8s.io/v1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | name: sparkclusters.distributed-compute.dominodatalab.com 6 | spec: 7 | conversion: 8 | strategy: Webhook 9 | webhook: 10 | conversionReviewVersions: ["v1","v1beta1"] 11 | clientConfig: 12 | service: 13 | namespace: system 14 | name: webhook-service 15 | path: /convert 16 | -------------------------------------------------------------------------------- /config/default/kustomization.yaml: 
-------------------------------------------------------------------------------- 1 | # Adds namespace to all resources. 2 | namespace: distributed-compute-operator-system 3 | 4 | # Value of this field is prepended to the 5 | # names of all resources, e.g. a deployment named 6 | # "wordpress" becomes "alices-wordpress". 7 | # Note that it should also match with the prefix (text before '-') of the namespace 8 | # field above. 9 | namePrefix: distributed-compute-operator- 10 | 11 | # Labels to add to all resources and selectors. 12 | #commonLabels: 13 | # someName: someValue 14 | 15 | bases: 16 | - ../crd 17 | - ../rbac 18 | - ../manager 19 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in 20 | # crd/kustomization.yaml 21 | - ../webhook 22 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. 23 | - ../certmanager 24 | # [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. 25 | #- ../prometheus 26 | 27 | patchesStrategicMerge: 28 | # Protect the /metrics endpoint by putting it behind auth. 29 | # If you want your controller-manager to expose the /metrics 30 | # endpoint w/o any authn/z, please comment the following line. 31 | #- manager_auth_proxy_patch.yaml 32 | 33 | # Mount the controller config file for loading manager configurations 34 | # through a ComponentConfig type 35 | #- manager_config_patch.yaml 36 | 37 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in 38 | # crd/kustomization.yaml 39 | - manager_webhook_patch.yaml 40 | 41 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 42 | # Uncomment 'CERTMANAGER' sections in crd/kustomization.yaml to enable the CA injection in the admission webhooks. 43 | # 'CERTMANAGER' needs to be enabled to use ca injection 44 | - webhookcainjection_patch.yaml 45 | 46 | # the following config is for teaching kustomize how to do var substitution 47 | vars: 48 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix. 49 | - name: CERTIFICATE_NAMESPACE # namespace of the certificate CR 50 | objref: 51 | kind: Certificate 52 | group: cert-manager.io 53 | version: v1 54 | name: serving-cert # this name should match the one in certificate.yaml 55 | fieldref: 56 | fieldpath: metadata.namespace 57 | - name: CERTIFICATE_NAME 58 | objref: 59 | kind: Certificate 60 | group: cert-manager.io 61 | version: v1 62 | name: serving-cert # this name should match the one in certificate.yaml 63 | - name: SERVICE_NAMESPACE # namespace of the service 64 | objref: 65 | kind: Service 66 | version: v1 67 | name: webhook-service 68 | fieldref: 69 | fieldpath: metadata.namespace 70 | - name: SERVICE_NAME 71 | objref: 72 | kind: Service 73 | version: v1 74 | name: webhook-service 75 | -------------------------------------------------------------------------------- /config/default/manager_auth_proxy_patch.yaml: -------------------------------------------------------------------------------- 1 | # This patch inject a sidecar container which is a HTTP proxy for the 2 | # controller manager, it performs RBAC authorization against the Kubernetes API using SubjectAccessReviews. 
3 | apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: controller-manager 7 | namespace: system 8 | spec: 9 | template: 10 | spec: 11 | containers: 12 | - name: kube-rbac-proxy 13 | image: gcr.io/kubebuilder/kube-rbac-proxy:v0.8.0 14 | args: 15 | - "--secure-listen-address=0.0.0.0:8443" 16 | - "--upstream=http://127.0.0.1:8080/" 17 | - "--logtostderr=true" 18 | - "--v=10" 19 | ports: 20 | - containerPort: 8443 21 | name: https 22 | - name: manager 23 | args: 24 | - "--health-probe-bind-address=:8081" 25 | - "--metrics-bind-address=127.0.0.1:8080" 26 | - "--leader-elect" 27 | -------------------------------------------------------------------------------- /config/default/manager_config_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: controller-manager 5 | namespace: system 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: manager 11 | args: 12 | - "--config=controller_manager_config.yaml" 13 | volumeMounts: 14 | - name: manager-config 15 | mountPath: /controller_manager_config.yaml 16 | subPath: controller_manager_config.yaml 17 | volumes: 18 | - name: manager-config 19 | configMap: 20 | name: manager-config 21 | -------------------------------------------------------------------------------- /config/default/manager_webhook_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: controller-manager 5 | namespace: system 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: manager 11 | ports: 12 | - containerPort: 9443 13 | name: webhook-server 14 | protocol: TCP 15 | volumeMounts: 16 | - mountPath: /tmp/k8s-webhook-server/serving-certs 17 | name: cert 18 | readOnly: true 19 | volumes: 20 | - name: cert 21 | secret: 22 | defaultMode: 420 23 | secretName: webhook-server-cert 24 | -------------------------------------------------------------------------------- /config/default/webhookcainjection_patch.yaml: -------------------------------------------------------------------------------- 1 | # This patch add annotation to admission webhook config and 2 | # the variables $(CERTIFICATE_NAMESPACE) and $(CERTIFICATE_NAME) will be substituted by kustomize. 
3 | apiVersion: admissionregistration.k8s.io/v1 4 | kind: MutatingWebhookConfiguration 5 | metadata: 6 | name: mutating-webhook-configuration 7 | annotations: 8 | cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) 9 | --- 10 | apiVersion: admissionregistration.k8s.io/v1 11 | kind: ValidatingWebhookConfiguration 12 | metadata: 13 | name: validating-webhook-configuration 14 | annotations: 15 | cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) 16 | -------------------------------------------------------------------------------- /config/manager/controller_manager_config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: controller-runtime.sigs.k8s.io/v1alpha1 2 | kind: ControllerManagerConfig 3 | health: 4 | healthProbeBindAddress: :8081 5 | metrics: 6 | bindAddress: 127.0.0.1:8080 7 | webhook: 8 | port: 9443 9 | leaderElection: 10 | leaderElect: true 11 | resourceName: a846cbf2.dominodatalab.com 12 | -------------------------------------------------------------------------------- /config/manager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - manager.yaml 3 | 4 | generatorOptions: 5 | disableNameSuffixHash: true 6 | 7 | configMapGenerator: 8 | - files: 9 | - controller_manager_config.yaml 10 | name: manager-config 11 | apiVersion: kustomize.config.k8s.io/v1beta1 12 | kind: Kustomization 13 | images: 14 | - name: controller 15 | newName: ghcr.io/dominodatalab/distributed-compute-operator 16 | newTag: latest 17 | -------------------------------------------------------------------------------- /config/manager/manager.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | labels: 5 | control-plane: controller-manager 6 | name: system 7 | --- 8 | apiVersion: apps/v1 9 | kind: Deployment 10 | metadata: 11 | name: controller-manager 12 | namespace: system 13 | labels: 14 | control-plane: controller-manager 15 | spec: 16 | selector: 17 | matchLabels: 18 | control-plane: controller-manager 19 | replicas: 1 20 | template: 21 | metadata: 22 | labels: 23 | control-plane: controller-manager 24 | spec: 25 | securityContext: 26 | runAsNonRoot: true 27 | containers: 28 | - command: 29 | - /manager 30 | - start 31 | args: 32 | - --leader-elect 33 | - --zap-log-level=5 34 | image: controller:latest 35 | imagePullPolicy: IfNotPresent # changed to aid development 36 | name: manager 37 | securityContext: 38 | allowPrivilegeEscalation: false 39 | livenessProbe: 40 | httpGet: 41 | path: /healthz 42 | port: 8081 43 | initialDelaySeconds: 15 44 | periodSeconds: 20 45 | readinessProbe: 46 | httpGet: 47 | path: /readyz 48 | port: 8081 49 | initialDelaySeconds: 5 50 | periodSeconds: 10 51 | resources: 52 | limits: 53 | cpu: 100m 54 | memory: 30Mi 55 | requests: 56 | cpu: 100m 57 | memory: 20Mi 58 | terminationGracePeriodSeconds: 10 59 | -------------------------------------------------------------------------------- /config/prometheus/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - monitor.yaml 3 | -------------------------------------------------------------------------------- /config/prometheus/monitor.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Prometheus Monitor Service (Metrics) 3 | apiVersion: monitoring.coreos.com/v1 4 | 
kind: ServiceMonitor 5 | metadata: 6 | labels: 7 | control-plane: controller-manager 8 | name: controller-manager-metrics-monitor 9 | namespace: system 10 | spec: 11 | endpoints: 12 | - path: /metrics 13 | port: https 14 | selector: 15 | matchLabels: 16 | control-plane: controller-manager 17 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_client_clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: metrics-reader 5 | rules: 6 | - nonResourceURLs: ["/metrics"] 7 | verbs: ["get"] 8 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: proxy-role 5 | rules: 6 | - apiGroups: ["authentication.k8s.io"] 7 | resources: 8 | - tokenreviews 9 | verbs: ["create"] 10 | - apiGroups: ["authorization.k8s.io"] 11 | resources: 12 | - subjectaccessreviews 13 | verbs: ["create"] 14 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: proxy-rolebinding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: proxy-role 9 | subjects: 10 | - kind: ServiceAccount 11 | name: default 12 | namespace: system 13 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | control-plane: controller-manager 6 | name: controller-manager-metrics-service 7 | namespace: system 8 | spec: 9 | ports: 10 | - name: https 11 | port: 8443 12 | targetPort: https 13 | selector: 14 | control-plane: controller-manager 15 | -------------------------------------------------------------------------------- /config/rbac/daskcluster_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to edit daskclusters. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: daskcluster-editor-role 6 | rules: 7 | - apiGroups: 8 | - distributed-compute.dominodatalab.com 9 | resources: 10 | - daskclusters 11 | verbs: 12 | - create 13 | - delete 14 | - get 15 | - list 16 | - patch 17 | - update 18 | - watch 19 | - apiGroups: 20 | - distributed-compute.dominodatalab.com 21 | resources: 22 | - daskclusters/status 23 | verbs: 24 | - get 25 | -------------------------------------------------------------------------------- /config/rbac/daskcluster_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to view daskclusters. 
2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: daskcluster-viewer-role 6 | rules: 7 | - apiGroups: 8 | - distributed-compute.dominodatalab.com 9 | resources: 10 | - daskclusters 11 | verbs: 12 | - get 13 | - list 14 | - watch 15 | - apiGroups: 16 | - distributed-compute.dominodatalab.com 17 | resources: 18 | - daskclusters/status 19 | verbs: 20 | - get 21 | -------------------------------------------------------------------------------- /config/rbac/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - role.yaml 3 | - role_binding.yaml 4 | - leader_election_role.yaml 5 | - leader_election_role_binding.yaml 6 | # Comment the following 4 lines if you want to disable 7 | # the auth proxy (https://github.com/brancz/kube-rbac-proxy) 8 | # which protects your /metrics endpoint. 9 | - auth_proxy_service.yaml 10 | - auth_proxy_role.yaml 11 | - auth_proxy_role_binding.yaml 12 | - auth_proxy_client_clusterrole.yaml 13 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions to do leader election. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | name: leader-election-role 6 | rules: 7 | - apiGroups: 8 | - "" 9 | - coordination.k8s.io 10 | resources: 11 | - configmaps 12 | - leases 13 | verbs: 14 | - get 15 | - list 16 | - watch 17 | - create 18 | - update 19 | - patch 20 | - delete 21 | - apiGroups: 22 | - "" 23 | resources: 24 | - events 25 | verbs: 26 | - create 27 | - patch 28 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: leader-election-rolebinding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: Role 8 | name: leader-election-role 9 | subjects: 10 | - kind: ServiceAccount 11 | name: default 12 | namespace: system 13 | -------------------------------------------------------------------------------- /config/rbac/mpicluster_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to edit mpiclusters. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: mpicluster-editor-role 6 | rules: 7 | - apiGroups: 8 | - distributed-compute.dominodatalab.com 9 | resources: 10 | - mpiclusters 11 | verbs: 12 | - create 13 | - delete 14 | - get 15 | - list 16 | - patch 17 | - update 18 | - watch 19 | - apiGroups: 20 | - distributed-compute.dominodatalab.com 21 | resources: 22 | - mpiclusters/status 23 | verbs: 24 | - get 25 | -------------------------------------------------------------------------------- /config/rbac/mpicluster_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to view mpiclusters. 
2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: mpicluster-viewer-role 6 | rules: 7 | - apiGroups: 8 | - distributed-compute.dominodatalab.com 9 | resources: 10 | - mpiclusters 11 | verbs: 12 | - get 13 | - list 14 | - watch 15 | - apiGroups: 16 | - distributed-compute.dominodatalab.com 17 | resources: 18 | - mpiclusters/status 19 | verbs: 20 | - get 21 | -------------------------------------------------------------------------------- /config/rbac/raycluster_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to edit rayclusters. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: raycluster-editor-role 6 | rules: 7 | - apiGroups: 8 | - distributed-compute.dominodatalab.com 9 | resources: 10 | - rayclusters 11 | verbs: 12 | - create 13 | - delete 14 | - get 15 | - list 16 | - patch 17 | - update 18 | - watch 19 | - apiGroups: 20 | - distributed-compute.dominodatalab.com 21 | resources: 22 | - rayclusters/status 23 | verbs: 24 | - get 25 | -------------------------------------------------------------------------------- /config/rbac/raycluster_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to view rayclusters. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: raycluster-viewer-role 6 | rules: 7 | - apiGroups: 8 | - distributed-compute.dominodatalab.com 9 | resources: 10 | - rayclusters 11 | verbs: 12 | - get 13 | - list 14 | - watch 15 | - apiGroups: 16 | - distributed-compute.dominodatalab.com 17 | resources: 18 | - rayclusters/status 19 | verbs: 20 | - get 21 | -------------------------------------------------------------------------------- /config/rbac/role.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | creationTimestamp: null 6 | name: manager-role 7 | rules: 8 | - apiGroups: 9 | - "" 10 | resources: 11 | - pods 12 | verbs: 13 | - list 14 | - watch 15 | - apiGroups: 16 | - "" 17 | resources: 18 | - serviceaccounts 19 | - services 20 | verbs: 21 | - create 22 | - list 23 | - update 24 | - watch 25 | - apiGroups: 26 | - apps 27 | resources: 28 | - configmaps 29 | verbs: 30 | - create 31 | - list 32 | - update 33 | - watch 34 | - apiGroups: 35 | - apps 36 | resources: 37 | - statefulsets 38 | verbs: 39 | - create 40 | - list 41 | - update 42 | - watch 43 | - apiGroups: 44 | - autoscaling 45 | resources: 46 | - horizontalpodautoscalers 47 | verbs: 48 | - create 49 | - delete 50 | - list 51 | - update 52 | - watch 53 | - apiGroups: 54 | - distributed-compute.dominodatalab.com 55 | resources: 56 | - daskclusters 57 | verbs: 58 | - create 59 | - delete 60 | - get 61 | - list 62 | - patch 63 | - update 64 | - watch 65 | - apiGroups: 66 | - distributed-compute.dominodatalab.com 67 | resources: 68 | - daskclusters/finalizers 69 | verbs: 70 | - update 71 | - apiGroups: 72 | - distributed-compute.dominodatalab.com 73 | resources: 74 | - daskclusters/status 75 | verbs: 76 | - get 77 | - patch 78 | - update 79 | - apiGroups: 80 | - distributed-compute.dominodatalab.com 81 | resources: 82 | - mpiclusters 83 | verbs: 84 | - create 85 | - delete 86 | - get 87 | - list 88 | - patch 89 | - update 90 | - watch 91 | - apiGroups: 92 | - distributed-compute.dominodatalab.com 93 | resources: 94 | - 
mpiclusters/finalizers 95 | verbs: 96 | - update 97 | - apiGroups: 98 | - distributed-compute.dominodatalab.com 99 | resources: 100 | - mpiclusters/status 101 | verbs: 102 | - get 103 | - patch 104 | - update 105 | - apiGroups: 106 | - distributed-compute.dominodatalab.com 107 | resources: 108 | - rayclusters 109 | verbs: 110 | - create 111 | - delete 112 | - get 113 | - list 114 | - patch 115 | - update 116 | - watch 117 | - apiGroups: 118 | - distributed-compute.dominodatalab.com 119 | resources: 120 | - rayclusters/finalizers 121 | verbs: 122 | - update 123 | - apiGroups: 124 | - distributed-compute.dominodatalab.com 125 | resources: 126 | - rayclusters/status 127 | verbs: 128 | - get 129 | - patch 130 | - update 131 | - apiGroups: 132 | - distributed-compute.dominodatalab.com 133 | resources: 134 | - sparkclusters 135 | verbs: 136 | - create 137 | - delete 138 | - get 139 | - list 140 | - patch 141 | - update 142 | - watch 143 | - apiGroups: 144 | - distributed-compute.dominodatalab.com 145 | resources: 146 | - sparkclusters/finalizers 147 | verbs: 148 | - update 149 | - apiGroups: 150 | - distributed-compute.dominodatalab.com 151 | resources: 152 | - sparkclusters/status 153 | verbs: 154 | - get 155 | - patch 156 | - update 157 | - apiGroups: 158 | - networking.k8s.io 159 | resources: 160 | - networkpolicies 161 | verbs: 162 | - create 163 | - delete 164 | - list 165 | - update 166 | - watch 167 | - apiGroups: 168 | - rbac.authorization.k8s.io 169 | resources: 170 | - rolebindings 171 | - roles 172 | verbs: 173 | - create 174 | - delete 175 | - list 176 | - update 177 | - watch 178 | -------------------------------------------------------------------------------- /config/rbac/role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: manager-rolebinding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: manager-role 9 | subjects: 10 | - kind: ServiceAccount 11 | name: default 12 | namespace: system 13 | -------------------------------------------------------------------------------- /config/samples/distributed-compute_v1alpha1_daskcluster.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: distributed-compute.dominodatalab.com/v1alpha1 2 | kind: DaskCluster 3 | metadata: 4 | name: example 5 | spec: 6 | # schedulerPort: 8786 7 | # dashboardPort: 8787 8 | # workerPort: 3000 9 | # nannyPort: 3001 10 | 11 | # additionalClientPorts: 12 | # - name: http-api-proxy 13 | # port: 8899 14 | # targetPort: 8899 15 | # protocol: TCP 16 | 17 | # image: 18 | # registry: "" 19 | # repository: daskdev/dask 20 | # tag: 2021.6.1 21 | # pullPolicy: IfNotPresent 22 | 23 | # autoscaling: 24 | # minReplicas: 25 | # maxReplicas: 26 | # averageCPUUtilization: 27 | # averageMemoryUtilization: 28 | # scaleDownStabilizationWindowSeconds: 29 | 30 | # networkPolicy: 31 | # enabled: true 32 | # clientLabels: {} 33 | # dashboardLabels: {} 34 | # dashboardNamespaceLabels: {} 35 | 36 | # serviceAccount: 37 | # name: "" 38 | # automountServiceAccountToken: false 39 | 40 | # podSecurityContext: 41 | # runAsUser: 42 | # runAsGroup: 43 | # fsGroup: 44 | 45 | # kerberosKeytab: 46 | # contents: 47 | # mountPath: 48 | 49 | # globalLabels: {} 50 | # envVars: [] 51 | # imagePullSecrets: [] 52 | # podSecurityPolicy: "" 53 | # istioMutualTLSMode: "" 54 | 55 | scheduler: 56 | # labels: {} 57 | # annotations: {} 
58 | # nodeSelector: {} 59 | # affinity: {} 60 | # tolerations: [] 61 | # initContainers: [] 62 | # volumes: [] 63 | # volumeMounts: [] 64 | # volumeClaimTemplates: [] 65 | # resources: {} 66 | 67 | worker: 68 | # replicas: 1 69 | # labels: {} 70 | # annotations: {} 71 | # nodeSelector: {} 72 | # affinity: {} 73 | # tolerations: [] 74 | # initContainers: [] 75 | # volumes: [] 76 | # volumeMounts: [] 77 | # volumeClaimTemplates: [] 78 | resources: 79 | requests: 80 | cpu: 250m 81 | memory: 250Mi 82 | -------------------------------------------------------------------------------- /config/samples/distributed-compute_v1alpha1_mpicluster.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: distributed-compute.dominodatalab.com/v1alpha1 2 | kind: MPICluster 3 | metadata: 4 | name: example 5 | spec: 6 | # image: 7 | # registry: "" 8 | # repository: horovod/horovod 9 | # tag: 0.22.1 10 | # pullPolicy: IfNotPresent 11 | 12 | # networkPolicy: 13 | # enabled: true 14 | # clientLabels: {} 15 | # dashboardLabels: {} 16 | 17 | # serviceAccount: 18 | # name: "" 19 | # automountServiceAccountToken: false 20 | 21 | # podSecurityContext: 22 | # runAsUser: 23 | # runAsGroup: 24 | # fsGroup: 25 | 26 | # kerberosKeytab: 27 | # contents: 28 | # mountPath: 29 | 30 | # globalLabels: {} 31 | # envVars: [] 32 | # imagePullSecrets: [] 33 | # podSecurityPolicy: "" 34 | # istioMutualTLSMode: "" 35 | 36 | # additionalClientPorts: 37 | # - name: http-api-proxy 38 | # port: 8899 39 | # targetPort: 8899 40 | # protocol: TCP 41 | 42 | worker: 43 | # replicas: 1 44 | sharedSSHSecret: "" 45 | # userName: 46 | # userID: 47 | # groupName: 48 | # groupID: 49 | # homeDir: /mnt 50 | # labels: {} 51 | # annotations: {} 52 | # nodeSelector: {} 53 | # affinity: {} 54 | # tolerations: [] 55 | # initContainers: [] 56 | # volumes: [] 57 | # volumeMounts: [] 58 | # volumeClaimTemplates: [] 59 | # resources: {} 60 | -------------------------------------------------------------------------------- /config/samples/distributed-compute_v1alpha1_raycluster.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: distributed-compute.dominodatalab.com/v1alpha1 2 | kind: RayCluster 3 | metadata: 4 | name: example 5 | spec: 6 | # redis port and addition redis shard ports used by head node 7 | # port: 1234 8 | # redisShardPorts: 9 | # - 6380 10 | # - 6381 11 | 12 | # additionalClientPorts: 13 | # - name: http-api-proxy 14 | # port: 8899 15 | # targetPort: 8899 16 | # protocol: TCP 17 | 18 | # port used to connect clients to ray 19 | # clientServerPort: 10001 20 | 21 | # enable dashboard ui and set port 22 | # dashboardPort: 8265 23 | # enableDashboard: true 24 | 25 | # set the object store's port and initial memory 26 | # objectManagerPort: 2384 27 | # objectStoreMemoryBytes: 52428800 28 | 29 | # internal port overrides 30 | # nodeManagerPort: 2385 31 | # gcsServerPort: 2386 32 | # workerPorts: 33 | # - 12000 34 | # - 12001 35 | # - 12002 36 | # - 12003 37 | # - 12004 38 | 39 | # image: 40 | # registry: "" 41 | # repository: rayproject/ray 42 | # tag: nightly 43 | # pullPolicy: IfNotPresent 44 | 45 | # autoscaling: 46 | # minReplicas: 47 | # maxReplicas: 48 | # averageCPUUtilization: 49 | # averageMemoryUtilization: 50 | # scaleDownStabilizationWindowSeconds: 51 | 52 | # networkPolicy: 53 | # enabled: true 54 | # clientLabels: {} 55 | # dashboardLabels: {} 56 | # dashboardNamespaceLabels: {} 57 | 58 | # serviceAccount: 59 | # name: "" 60 | # 
automountServiceAccountToken: false 61 | 62 | # podSecurityContext: 63 | # runAsUser: 64 | # runAsGroup: 65 | # fsGroup: 66 | 67 | # kerberosKeytab: 68 | # contents: 69 | # mountPath: 70 | 71 | # globalLabels: {} 72 | # envVars: [] 73 | # imagePullSecrets: [] 74 | # podSecurityPolicy: "" 75 | # istioMutualTLSMode: "" 76 | 77 | head: 78 | # labels: {} 79 | # annotations: {} 80 | # nodeSelector: {} 81 | # affinity: {} 82 | # tolerations: [] 83 | # initContainers: [] 84 | # volumes: [] 85 | # volumeMounts: [] 86 | # volumeClaimTemplates: [] 87 | # resources: {} 88 | 89 | worker: 90 | # replicas: 2 91 | # labels: {} 92 | # annotations: {} 93 | # nodeSelector: {} 94 | # affinity: {} 95 | # tolerations: [] 96 | # initContainers: [] 97 | # volumes: [] 98 | # volumeMounts: [] 99 | # volumeClaimTemplates: [] 100 | resources: 101 | requests: 102 | cpu: 100m 103 | memory: 250Mi 104 | -------------------------------------------------------------------------------- /config/samples/distributed-compute_v1alpha1_sparkcluster.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: distributed-compute.dominodatalab.com/v1alpha1 2 | kind: SparkCluster 3 | metadata: 4 | name: example 5 | spec: 6 | workerMemoryLimit: 100m 7 | 8 | # envoyFilterLabels: {} 9 | # clusterPort: 7077 10 | # masterWebPort: 8080 11 | # workerWebPort: 8081 12 | 13 | # additionalClientPorts: 14 | # - name: http-api-proxy 15 | # port: 8899 16 | # targetPort: 8899 17 | # protocol: TCP 18 | 19 | # image: 20 | # registry: "" 21 | # repository: bitnami/spark 22 | # tag: 3.0.2-debian-10-r0 23 | # pullPolicy: IfNotPresent 24 | 25 | # autoscaling: 26 | # minReplicas: 27 | # maxReplicas: 28 | # averageCPUUtilization: 29 | # averageMemoryUtilization: 30 | # scaleDownStabilizationWindowSeconds: 31 | 32 | # networkPolicy: 33 | # enabled: true 34 | # clientLabels: {} 35 | # dashboardLabels: {} 36 | 37 | # serviceAccount: 38 | # name: "" 39 | # automountServiceAccountToken: false 40 | 41 | # podSecurityContext: 42 | # runAsUser: 43 | # runAsGroup: 44 | # fsGroup: 45 | 46 | # kerberosKeytab: 47 | # contents: 48 | # mountPath: 49 | 50 | # globalLabels: {} 51 | # envVars: [] 52 | # imagePullSecrets: [] 53 | # podSecurityPolicy: "" 54 | # istioMutualTLSMode: "" 55 | 56 | master: 57 | # defaultConfiguration: 58 | # spark.driver.host: "driver-service.ns.svc.cluster.local" 59 | # spark.executor.cores: "4" 60 | # spark.executor.instances: "1" 61 | # spark.executor.memory: 15360m 62 | # spark.ui.proxyBase: "/master/proxy/url" 63 | # spark.ui.reverseProxy: "true" 64 | # spark.ui.reverseProxyUrl: "/master/proxy/base" 65 | # labels: {} 66 | # annotations: {} 67 | # nodeSelector: {} 68 | # affinity: {} 69 | # tolerations: [] 70 | # initContainers: [] 71 | # volumes: [] 72 | # volumeMounts: [] 73 | # volumeClaimTemplates: [] 74 | resources: 75 | requests: 76 | cpu: 100m 77 | memory: 250Mi 78 | 79 | worker: 80 | # defaultConfiguration: 81 | # spark.driver.host: "driver-svc.ns.svc.cluster.local" 82 | # spark.executor.cores: "4" 83 | # spark.executor.instances: "1" 84 | # spark.executor.memory: 15360m 85 | # spark.ui.proxyBase: "/worker/proxy/base" 86 | # spark.ui.reverseProxy: "true" 87 | # spark.ui.reverseProxyUrl: "/worker/proxy/url" 88 | # replicas: 1 89 | # labels: {} 90 | # annotations: {} 91 | # nodeSelector: {} 92 | # affinity: {} 93 | # tolerations: [] 94 | # initContainers: [] 95 | # volumes: [] 96 | # volumeMounts: [] 97 | # volumeClaimTemplates: [] 98 | resources: 99 | requests: 100 | cpu: 1 101 | memory: 
250Mi 102 | 103 | driver: 104 | # port: 4041 105 | # uiPort: 4040 106 | # blockManagerPort: 4042 107 | selector: 108 | app.kubernetes.io/instance: driver-pod 109 | -------------------------------------------------------------------------------- /config/webhook/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - manifests.yaml 3 | - service.yaml 4 | 5 | configurations: 6 | - kustomizeconfig.yaml 7 | -------------------------------------------------------------------------------- /config/webhook/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # the following config is for teaching kustomize where to look at when substituting vars. 2 | # It requires kustomize v2.1.0 or newer to work properly. 3 | nameReference: 4 | - kind: Service 5 | version: v1 6 | fieldSpecs: 7 | - kind: MutatingWebhookConfiguration 8 | group: admissionregistration.k8s.io 9 | path: webhooks/clientConfig/service/name 10 | - kind: ValidatingWebhookConfiguration 11 | group: admissionregistration.k8s.io 12 | path: webhooks/clientConfig/service/name 13 | 14 | namespace: 15 | - kind: MutatingWebhookConfiguration 16 | group: admissionregistration.k8s.io 17 | path: webhooks/clientConfig/service/namespace 18 | create: true 19 | - kind: ValidatingWebhookConfiguration 20 | group: admissionregistration.k8s.io 21 | path: webhooks/clientConfig/service/namespace 22 | create: true 23 | 24 | varReference: 25 | - path: metadata/annotations 26 | -------------------------------------------------------------------------------- /config/webhook/service.yaml: -------------------------------------------------------------------------------- 1 | 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: webhook-service 6 | namespace: system 7 | spec: 8 | ports: 9 | - port: 443 10 | targetPort: 9443 11 | selector: 12 | control-plane: controller-manager 13 | -------------------------------------------------------------------------------- /controllers/config.go: -------------------------------------------------------------------------------- 1 | package controllers 2 | 3 | import "sigs.k8s.io/controller-runtime/pkg/log/zap" 4 | 5 | // Config options for the controller manager. 
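// An illustrative example of populating it (added by the editor as a sketch;
// the addresses and image references are placeholders, not project defaults):
//
//	cfg := Config{
//		MetricsAddr:       ":8080",
//		HealthProbeAddr:   ":8081",
//		WebhookServerPort: 9443,
//		MPIInitImage:      "registry.example.com/mpi-init:latest",
//		MPISyncImage:      "registry.example.com/mpi-sync:latest",
//	}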
6 | type Config struct { 7 | Namespaces []string 8 | MetricsAddr string 9 | HealthProbeAddr string 10 | WebhookServerPort int 11 | EnableLeaderElection bool 12 | IstioEnabled bool 13 | ZapOptions zap.Options 14 | MPIInitImage string 15 | MPISyncImage string 16 | } 17 | -------------------------------------------------------------------------------- /controllers/controllers.go: -------------------------------------------------------------------------------- 1 | package controllers 2 | 3 | import ( 4 | ctrl "sigs.k8s.io/controller-runtime" 5 | ) 6 | 7 | type Builder func(manager ctrl.Manager, webhooksEnabled bool, cfg *Config) error 8 | 9 | var BuilderFuncs = []Builder{ 10 | DaskCluster, 11 | MPICluster, 12 | } 13 | -------------------------------------------------------------------------------- /controllers/daskcluster_controller.go: -------------------------------------------------------------------------------- 1 | package controllers 2 | 3 | import ( 4 | ctrl "sigs.k8s.io/controller-runtime" 5 | 6 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 7 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/dask" 8 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 9 | ) 10 | 11 | //+kubebuilder:rbac:groups=distributed-compute.dominodatalab.com,resources=daskclusters,verbs=get;list;watch;create;update;patch;delete 12 | //+kubebuilder:rbac:groups=distributed-compute.dominodatalab.com,resources=daskclusters/status,verbs=get;update;patch 13 | //+kubebuilder:rbac:groups=distributed-compute.dominodatalab.com,resources=daskclusters/finalizers,verbs=update 14 | 15 | func DaskCluster(mgr ctrl.Manager, webhooksEnabled bool, cfg *Config) error { 16 | reconciler := core.NewReconciler(mgr). 17 | For(&dcv1alpha1.DaskCluster{}). 18 | Component("istio-peerauthentication", dask.IstioPeerAuthentication(cfg.IstioEnabled)). 19 | Component("serviceaccount", dask.ServiceAccount()). 20 | Component("configmap-keytab", dask.ConfigMapKeyTab()). 21 | Component("role-podsecuritypolicy", dask.RolePodSecurityPolicy()). 22 | Component("rolebinding-podsecuritypolicy", dask.RoleBindingPodSecurityPolicy()). 23 | Component("service-scheduler", dask.ServiceScheduler()). 24 | Component("service-worker", dask.ServiceWorker()). 25 | Component("service-proxy", dask.ClientPortsService()). 26 | Component("networkpolicy-scheduler", dask.NetworkPolicyScheduler()). 27 | Component("networkpolicy-worker", dask.NetworkPolicyWorker()). 28 | Component("networkpolicy-proxy", dask.ClientPortsNetworkPolicy()). 29 | Component("statefulset-scheduler", dask.StatefulSetScheduler()). 30 | Component("statefulset-worker", dask.StatefulSetWorker()). 31 | Component("horizontalpodautoscaler", dask.HorizontalPodAutoscaler()). 
32 | Component("statusupdate", dask.ClusterStatusUpdate()) 33 | 34 | if webhooksEnabled { 35 | reconciler.WithWebhooks() 36 | } 37 | return reconciler.Complete() 38 | } 39 | -------------------------------------------------------------------------------- /controllers/mpicluster_controller.go: -------------------------------------------------------------------------------- 1 | package controllers 2 | 3 | import ( 4 | ctrl "sigs.k8s.io/controller-runtime" 5 | 6 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 7 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/mpi" 8 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 9 | ) 10 | 11 | //+kubebuilder:rbac:groups=distributed-compute.dominodatalab.com,resources=mpiclusters,verbs=get;list;watch;create;update;patch;delete 12 | //+kubebuilder:rbac:groups=distributed-compute.dominodatalab.com,resources=mpiclusters/status,verbs=get;update;patch 13 | //+kubebuilder:rbac:groups=distributed-compute.dominodatalab.com,resources=mpiclusters/finalizers,verbs=update 14 | 15 | // MPICluster builds a controller that reconciles MPICluster objects and registers it with the manager. 16 | func MPICluster(mgr ctrl.Manager, webhooksEnabled bool, cfg *Config) error { 17 | reconciler := core.NewReconciler(mgr). 18 | For(&dcv1alpha1.MPICluster{}). 19 | Component("istio-peerauthentication", mpi.IstioPeerAuthentication(cfg.IstioEnabled)). 20 | Component("istio-client-peerauthentication", mpi.IstioClientPeerAuthentication(cfg.IstioEnabled)). 21 | Component("serviceaccount", mpi.ServiceAccount()). 22 | Component("role", mpi.RolePodSecurityPolicy()). 23 | Component("rolebinding", mpi.RoleBindingPodSecurityPolicy()). 24 | Component("configmap", mpi.ConfigMap()). 25 | Component("service-worker", mpi.ServiceWorker()). 26 | Component("service-proxy", mpi.ClientPortsService()). 27 | Component("service-client", mpi.ServiceClient()). 28 | Component("networkpolicy-worker", mpi.NetworkPolicyWorker()). 29 | Component("networkpolicy-client", mpi.NetworkPolicyClient()). 30 | Component("networkpolicy-proxy", mpi.ClientPortsNetworkPolicy()). 31 | Component("workers", mpi.StatefulSet(cfg.MPIInitImage, cfg.MPISyncImage)). 32 | Component("statusupdate", mpi.StatusUpdate()) 33 | 34 | if webhooksEnabled { 35 | reconciler.WithWebhooks() 36 | } 37 | return reconciler.Complete() 38 | } 39 | -------------------------------------------------------------------------------- /controllers/suite_test.go: -------------------------------------------------------------------------------- 1 | package controllers 2 | 3 | import ( 4 | "context" 5 | "path/filepath" 6 | "testing" 7 | 8 | . "github.com/onsi/ginkgo" 9 | . 
"github.com/onsi/gomega" 10 | "k8s.io/client-go/kubernetes/scheme" 11 | "k8s.io/client-go/rest" 12 | ctrl "sigs.k8s.io/controller-runtime" 13 | "sigs.k8s.io/controller-runtime/pkg/client" 14 | "sigs.k8s.io/controller-runtime/pkg/envtest" 15 | logf "sigs.k8s.io/controller-runtime/pkg/log" 16 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 17 | 18 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 19 | // +kubebuilder:scaffold:imports 20 | ) 21 | 22 | var cfg *rest.Config 23 | var k8sClient client.Client 24 | var testEnv *envtest.Environment 25 | var ctx context.Context 26 | var cancel context.CancelFunc 27 | 28 | func TestAPIs(t *testing.T) { 29 | if testing.Short() { 30 | t.Skip("skipping controller suite in short mode") 31 | } 32 | 33 | RegisterFailHandler(Fail) 34 | RunSpecs(t, "Controller Suite") 35 | } 36 | 37 | var _ = BeforeSuite(func() { 38 | logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) 39 | 40 | ctx, cancel = context.WithCancel(context.TODO()) 41 | 42 | By("bootstrapping test environment") 43 | testEnv = &envtest.Environment{ 44 | CRDDirectoryPaths: []string{filepath.Join("..", "config", "crd", "bases")}, 45 | ErrorIfCRDPathMissing: true, 46 | } 47 | 48 | var err error 49 | cfg, err = testEnv.Start() 50 | Expect(err).NotTo(HaveOccurred()) 51 | Expect(cfg).NotTo(BeNil()) 52 | 53 | err = dcv1alpha1.AddToScheme(scheme.Scheme) 54 | Expect(err).NotTo(HaveOccurred()) 55 | 56 | //+kubebuilder:scaffold:scheme 57 | 58 | k8sManager, err := ctrl.NewManager(cfg, ctrl.Options{Scheme: scheme.Scheme}) 59 | Expect(err).NotTo(HaveOccurred()) 60 | 61 | k8sClient = k8sManager.GetClient() 62 | Expect(k8sClient).NotTo(BeNil()) 63 | 64 | config := Config{} 65 | 66 | for _, controller := range BuilderFuncs { 67 | err = controller(k8sManager, false, &config) 68 | Expect(err).ToNot(HaveOccurred()) 69 | } 70 | 71 | err = (&RayClusterReconciler{ 72 | Client: k8sClient, 73 | Scheme: k8sManager.GetScheme(), 74 | Log: ctrl.Log.WithName("controllers").WithName("RayCluster"), 75 | IstioEnabled: false, 76 | }).SetupWithManager(k8sManager) 77 | Expect(err).ToNot(HaveOccurred()) 78 | 79 | err = (&SparkClusterReconciler{ 80 | Client: k8sClient, 81 | Scheme: k8sManager.GetScheme(), 82 | Log: ctrl.Log.WithName("controllers").WithName("SparkCluster"), 83 | IstioEnabled: false, 84 | }).SetupWithManager(k8sManager) 85 | Expect(err).ToNot(HaveOccurred()) 86 | 87 | go func() { 88 | err = k8sManager.Start(ctx) 89 | Expect(err).ToNot(HaveOccurred()) 90 | }() 91 | }, 60) 92 | 93 | var _ = AfterSuite(func() { 94 | cancel() 95 | By("tearing down the test environment") 96 | err := testEnv.Stop() 97 | Expect(err).ToNot(HaveOccurred()) 98 | }) 99 | -------------------------------------------------------------------------------- /controllers/variables.go: -------------------------------------------------------------------------------- 1 | package controllers 2 | 3 | import ( 4 | "path" 5 | 6 | "github.com/banzaicloud/k8s-objectmatcher/patch" 7 | 8 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 9 | ) 10 | 11 | var ( 12 | // DistributedComputeFinalizer is the custom identifier used to mark 13 | // controller-managed resources that require pre-delete hook logic. 14 | DistributedComputeFinalizer = path.Join(dcv1alpha1.GroupVersion.Group, "finalizer") 15 | 16 | // PatchAnnotator applies state annotations to owned components. 
17 | PatchAnnotator = patch.NewAnnotator(path.Join(dcv1alpha1.GroupVersion.Group, "last-applied")) 18 | // PatchMaker calculates changes to state annotations on owned components. 19 | PatchMaker = patch.NewPatchMaker(PatchAnnotator, &patch.K8sStrategicMergePatcher{}, &patch.BaseJSONMergePatcher{}) 20 | // PatchCalculateOpts define the exclusion rules used when calculating the 21 | // difference between two k8s resources. 22 | PatchCalculateOpts = []patch.CalculateOption{ 23 | patch.IgnoreStatusFields(), 24 | patch.IgnoreVolumeClaimTemplateTypeMetaAndStatus(), 25 | } 26 | ) 27 | -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/Chart.lock: -------------------------------------------------------------------------------- 1 | dependencies: 2 | - name: common 3 | repository: https://charts.bitnami.com/bitnami 4 | version: 1.4.1 5 | digest: sha256:c53e3c3325fc8b9b8b41efd417bad52765452600992fe8612c8cb062725b505a 6 | generated: "2021-03-11T13:33:51.03347-07:00" 7 | -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | type: application 3 | name: distributed-compute-operator 4 | version: 0.0.0 5 | appVersion: "latest" 6 | kubeVersion: ">= 1.16.0-0" 7 | description: Kubernetes operator that manages parallel computing clusters. 
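# Editor's note (not part of the original chart metadata): a minimal install
# from a local checkout might look like the commands below. The release name
# "dco" and namespace "dco-system" are placeholders; the dependency step can
# usually be skipped because charts/common-1.4.1.tgz is already vendored.
#
#   helm dependency build deploy/helm/distributed-compute-operator
#   helm install dco deploy/helm/distributed-compute-operator \
#     --namespace dco-system --create-namespace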
8 | home: https://github.com/dominodatalab/distributed-compute-operator 9 | icon: https://raw.githubusercontent.com/dominodatalab/distributed-compute-operator/main/docs/img/logo.png 10 | maintainers: 11 | - name: sonnysideup 12 | email: eng-platform@dominodatalab.com 13 | dependencies: 14 | - name: common 15 | version: 1.4.1 16 | repository: https://charts.bitnami.com/bitnami 17 | -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/charts/common-1.4.1.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dominodatalab/distributed-compute-operator/2f119d377b1632ea2e9817cacc24a6a54414aed1/deploy/helm/distributed-compute-operator/charts/common-1.4.1.tgz -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/dco-values.yaml: -------------------------------------------------------------------------------- 1 | USER-SUPPLIED VALUES: 2 | image: 3 | registry: quay.io 4 | repository: domino/distributed-compute-operator 5 | tag: v0.7.3 6 | pullPolicy: Always 7 | imagePullSecrets: 8 | - name: domino-quay-repos 9 | installCRDs: true 10 | global: 11 | istio: 12 | cni: true 13 | enabled: false 14 | install: false 15 | mpi: 16 | initImage: 17 | registry: quay.io 18 | repository: domino/distributed-compute-operator-mpi-init 19 | tag: v0.7.3 20 | syncImage: 21 | registry: quay.io 22 | repository: domino/distributed-compute-operator-mpi-sync 23 | tag: v0.7.3 24 | networkPolicy: 25 | enabled: true 26 | nodeSelector: 27 | dominodatalab.com/node-pool: default 28 | podAnnotations: {} 29 | podEnv: [] 30 | podLabels: {} 31 | podSecurityPolicy: 32 | enabled: true 33 | priorityClassName: domino-default 34 | prometheus: 35 | enabled: true 36 | namespaceLabels: 37 | domino-platform: "true" 38 | rbac: 39 | pspEnabled: true 40 | replicaCount: 1 41 | securityContextConstraints: 42 | enabled: false 43 | -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Thank you for installing the {{ .Chart.Name }}. 2 | 3 | Your release is named {{ .Release.Name }}. 4 | 5 | To learn more about the release, try: 6 | 7 | $ helm status {{ .Release.Name }} 8 | $ helm get all {{ .Release.Name }} 9 | 10 | To list the available cluster types this operator manages, try: 11 | 12 | $ kubectl get crds | grep distributed-compute 13 | 14 | See the following samples to learn how to create a new cluster: 15 | 16 | https://github.com/dominodatalab/distributed-compute-operator/tree/main/config/samples 17 | -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Return the proper image name 3 | */}} 4 | {{- define "dco.image" -}} 5 | {{- $imageRoot := .Values.image -}} 6 | {{- $_ := set $imageRoot "tag" (.Values.image.tag | default .Chart.AppVersion) -}} 7 | {{- include "common.images.image" (dict "imageRoot" $imageRoot "global" $) -}} 8 | {{- end -}} 9 | 10 | {{/* 11 | Create the name of the service account to use 12 | */}} 13 | {{- define "dco.serviceAccountName" -}} 14 | {{- if .Values.serviceAccount.create -}} 15 | {{- default (include "common.names.fullname" .) 
.Values.serviceAccount.name -}} 16 | {{- else -}} 17 | {{- default "default" .Values.serviceAccount.name -}} 18 | {{- end -}} 19 | {{- end -}} 20 | 21 | {{/* 22 | Webhook service name 23 | */}} 24 | {{- define "dco.webhook.service" -}} 25 | {{ include "common.names.fullname" . }}-webhook-server 26 | {{- end -}} 27 | 28 | {{/* 29 | Webhook certificate CA name 30 | */}} 31 | {{- define "dco.webhook.issuer" -}} 32 | {{ include "common.names.fullname" . }}-selfsigned-issuer 33 | {{- end -}} 34 | 35 | {{/* 36 | Webhook certificate name 37 | */}} 38 | {{- define "dco.webhook.certificate" -}} 39 | {{ include "common.names.fullname" . }}-webhook 40 | {{- end -}} 41 | 42 | {{/* 43 | Webhook certificate secret name 44 | */}} 45 | {{- define "dco.webhook.secret" -}} 46 | {{ include "common.names.fullname" . }}-webhook-cert 47 | {{- end -}} 48 | 49 | {{/* 50 | Webhook certificate injection annotation 51 | */}} 52 | {{- define "dco.webhook.annotation" -}} 53 | cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include "dco.webhook.certificate" . }} 54 | {{- end -}} 55 | 56 | {{/* 57 | Returns a name suitable for all manager RBAC objects 58 | */}} 59 | {{- define "dco.rbac.managerName" -}} 60 | dominodatalab:operator:{{ include "common.names.fullname" . }}:manager 61 | {{- end -}} 62 | 63 | {{/* 64 | Returns a name suitable for all hook RBAC objects 65 | */}} 66 | {{- define "dco.rbac.hookName" -}} 67 | dominodatalab:operator:{{ include "common.names.fullname" . }}:hook 68 | {{- end -}} 69 | -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/templates/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "dco.rbac.managerName" . }}.{{ .Release.Namespace }} 5 | labels: 6 | {{- include "common.labels.standard" . 
| nindent 4 }} 7 | rules: 8 | - apiGroups: 9 | - distributed-compute.dominodatalab.com 10 | resources: 11 | - daskclusters 12 | - rayclusters 13 | - sparkclusters 14 | - mpiclusters 15 | verbs: 16 | - patch 17 | - update 18 | - list 19 | - watch 20 | - apiGroups: 21 | - distributed-compute.dominodatalab.com 22 | resources: 23 | - daskclusters/status 24 | - rayclusters/status 25 | - sparkclusters/status 26 | - mpiclusters/status 27 | verbs: 28 | - update 29 | - apiGroups: 30 | - distributed-compute.dominodatalab.com 31 | resources: 32 | - daskclusters/finalizers 33 | - rayclusters/finalizers 34 | - sparkclusters/finalizers 35 | - mpiclusters/finalizers 36 | verbs: 37 | - update 38 | - apiGroups: 39 | - "" 40 | resources: 41 | - endpoints 42 | - pods 43 | verbs: 44 | - list 45 | - watch 46 | - apiGroups: 47 | - "" 48 | resources: 49 | - configmaps 50 | - secrets 51 | - serviceaccounts 52 | verbs: 53 | - create 54 | - update 55 | - delete 56 | - list 57 | - watch 58 | - apiGroups: 59 | - "" 60 | resources: 61 | - services 62 | verbs: 63 | - create 64 | - update 65 | - list 66 | - watch 67 | - apiGroups: 68 | - "" 69 | resources: 70 | - persistentvolumeclaims 71 | verbs: 72 | - delete 73 | - list 74 | - watch 75 | - apiGroups: 76 | - apps 77 | resources: 78 | - statefulsets 79 | verbs: 80 | - create 81 | - update 82 | - list 83 | - watch 84 | - delete 85 | - apiGroups: 86 | - autoscaling 87 | resources: 88 | - horizontalpodautoscalers 89 | verbs: 90 | - create 91 | - update 92 | - delete 93 | - list 94 | - watch 95 | - apiGroups: 96 | - batch 97 | resources: 98 | - jobs 99 | verbs: 100 | - create 101 | - update 102 | - delete 103 | - list 104 | - watch 105 | - apiGroups: 106 | - networking.k8s.io 107 | resources: 108 | - networkpolicies 109 | verbs: 110 | - create 111 | - update 112 | - delete 113 | - list 114 | - watch 115 | - apiGroups: 116 | - policy 117 | resources: 118 | - podsecuritypolicies 119 | resourceNames: 120 | - domino-restricted 121 | verbs: 122 | - use 123 | - list 124 | - watch 125 | - apiGroups: 126 | - rbac.authorization.k8s.io 127 | resources: 128 | - roles 129 | - rolebindings 130 | verbs: 131 | - create 132 | - update 133 | - delete 134 | - list 135 | - watch 136 | - apiGroups: 137 | - security.istio.io 138 | resources: 139 | - peerauthentications 140 | verbs: 141 | - create 142 | - update 143 | - delete 144 | - list 145 | - watch 146 | - apiGroups: 147 | - networking.istio.io 148 | resources: 149 | - envoyfilters 150 | verbs: 151 | - create 152 | - update 153 | - list 154 | - watch 155 | {{- if .Values.config.enableLeaderElection }} 156 | - apiGroups: 157 | - "" 158 | resources: 159 | - configmaps 160 | verbs: 161 | - get 162 | - apiGroups: 163 | - "" 164 | resources: 165 | - events 166 | verbs: 167 | - create 168 | - apiGroups: 169 | - coordination.k8s.io 170 | resources: 171 | - leases 172 | verbs: 173 | - get 174 | - create 175 | - update 176 | {{- end }} 177 | -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "dco.rbac.managerName" . }}.{{ .Release.Namespace }} 5 | labels: 6 | {{- include "common.labels.standard" . | nindent 4 }} 7 | roleRef: 8 | apiGroup: rbac.authorization.k8s.io 9 | kind: ClusterRole 10 | name: {{ include "dco.rbac.managerName" . 
}}.{{ .Release.Namespace }} 11 | subjects: 12 | - kind: ServiceAccount 13 | name: {{ include "dco.serviceAccountName" . }} 14 | namespace: {{ .Release.Namespace }} 15 | -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/templates/istio.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.global.istio.enabled }} 2 | apiVersion: security.istio.io/v1beta1 3 | kind: PeerAuthentication 4 | metadata: 5 | name: {{ include "dco.webhook.service" .}} 6 | labels: 7 | {{- include "common.labels.standard" . | nindent 4 }} 8 | spec: 9 | selector: 10 | matchLabels: 11 | {{- include "common.labels.matchLabels" . | nindent 6 }} 12 | mtls: 13 | mode: UNSET 14 | portLevelMtls: 15 | {{ .Values.config.webhookPort }}: 16 | mode: DISABLE 17 | {{- end }} 18 | -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/templates/networkpolicy.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.networkPolicy.enabled }} 2 | apiVersion: networking.k8s.io/v1 3 | kind: NetworkPolicy 4 | metadata: 5 | name: {{ include "common.names.fullname" . }} 6 | labels: 7 | {{- include "common.labels.standard" . | nindent 4 }} 8 | spec: 9 | podSelector: 10 | matchLabels: 11 | {{- include "common.labels.matchLabels" . | nindent 6 }} 12 | policyTypes: 13 | - Ingress 14 | ingress: 15 | - ports: 16 | - port: {{ .Values.config.webhookPort }} 17 | protocol: TCP 18 | - port: {{ .Values.config.healthProbePort }} 19 | protocol: TCP 20 | - ports: 21 | - port: {{ .Values.config.metricsPort }} 22 | protocol: TCP 23 | from: 24 | - podSelector: 25 | matchLabels: 26 | {{- toYaml .Values.prometheus.podLabels | trimSuffix "\n" | nindent 10 }} 27 | {{- with .Values.prometheus.namespaceLabels }} 28 | namespaceSelector: 29 | matchLabels: 30 | {{- toYaml . | trimSuffix "\n" | nindent 10 }} 31 | {{- end }} 32 | {{- end }} 33 | -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create }} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "dco.serviceAccountName" . }} 6 | labels: 7 | {{- include "common.labels.standard" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | {{- end }} 13 | -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/templates/webhook-cert-manager.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: cert-manager.io/v1 2 | kind: Issuer 3 | metadata: 4 | name: {{ include "dco.webhook.issuer" . }} 5 | labels: 6 | {{- include "common.labels.standard" . | nindent 4 }} 7 | spec: 8 | selfSigned: {} 9 | 10 | --- 11 | apiVersion: cert-manager.io/v1 12 | kind: Certificate 13 | metadata: 14 | name: {{ include "dco.webhook.certificate" . }} 15 | labels: 16 | {{- include "common.labels.standard" . | nindent 4 }} 17 | spec: 18 | dnsNames: 19 | - {{ include "dco.webhook.service" . }}.{{ .Release.Namespace }}.svc 20 | - {{ include "dco.webhook.service" . 
}}.{{ .Release.Namespace }}.svc.{{ .Values.clusterDomain }} 21 | issuerRef: 22 | kind: Issuer 23 | name: {{ include "dco.webhook.issuer" . }} 24 | secretName: {{ include "dco.webhook.secret" . }} 25 | -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/templates/webhook-configuration-mutating.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: MutatingWebhookConfiguration 3 | metadata: 4 | name: {{ include "common.names.fullname" . }}.{{ .Release.Namespace }} 5 | labels: 6 | {{- include "common.labels.standard" . | nindent 4 }} 7 | annotations: 8 | {{- include "dco.webhook.annotation" . | nindent 4 }} 9 | webhooks: 10 | - admissionReviewVersions: 11 | - v1 12 | - v1beta1 13 | clientConfig: 14 | service: 15 | name: {{ include "dco.webhook.service" . }} 16 | namespace: {{ .Release.Namespace }} 17 | path: /mutate-distributed-compute-dominodatalab-com-v1alpha1-mpicluster 18 | failurePolicy: Fail 19 | name: mmpicluster.kb.io 20 | rules: 21 | - apiGroups: 22 | - distributed-compute.dominodatalab.com 23 | apiVersions: 24 | - v1alpha1 25 | operations: 26 | - CREATE 27 | - UPDATE 28 | resources: 29 | - mpiclusters 30 | sideEffects: None 31 | - admissionReviewVersions: 32 | - v1 33 | - v1beta1 34 | clientConfig: 35 | service: 36 | name: {{ include "dco.webhook.service" . }} 37 | namespace: {{ .Release.Namespace }} 38 | path: /mutate-distributed-compute-dominodatalab-com-v1alpha1-daskcluster 39 | failurePolicy: Fail 40 | name: mdaskcluster.kb.io 41 | rules: 42 | - apiGroups: 43 | - distributed-compute.dominodatalab.com 44 | apiVersions: 45 | - v1alpha1 46 | operations: 47 | - CREATE 48 | - UPDATE 49 | resources: 50 | - daskclusters 51 | sideEffects: None 52 | - admissionReviewVersions: 53 | - v1 54 | - v1beta1 55 | clientConfig: 56 | service: 57 | name: {{ include "dco.webhook.service" . }} 58 | namespace: {{ .Release.Namespace }} 59 | path: /mutate-distributed-compute-dominodatalab-com-v1alpha1-raycluster 60 | failurePolicy: Fail 61 | name: mraycluster.kb.io 62 | rules: 63 | - apiGroups: 64 | - distributed-compute.dominodatalab.com 65 | apiVersions: 66 | - v1alpha1 67 | operations: 68 | - CREATE 69 | - UPDATE 70 | resources: 71 | - rayclusters 72 | sideEffects: None 73 | - admissionReviewVersions: 74 | - v1 75 | - v1beta1 76 | clientConfig: 77 | service: 78 | name: {{ include "dco.webhook.service" . }} 79 | namespace: {{ .Release.Namespace }} 80 | path: /mutate-distributed-compute-dominodatalab-com-v1alpha1-sparkcluster 81 | failurePolicy: Fail 82 | name: msparkcluster.kb.io 83 | rules: 84 | - apiGroups: 85 | - distributed-compute.dominodatalab.com 86 | apiVersions: 87 | - v1alpha1 88 | operations: 89 | - CREATE 90 | - UPDATE 91 | resources: 92 | - sparkclusters 93 | sideEffects: None 94 | -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/templates/webhook-configuration-validating.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingWebhookConfiguration 3 | metadata: 4 | name: {{ include "common.names.fullname" . }}.{{ .Release.Namespace }} 5 | labels: 6 | {{- include "common.labels.standard" . | nindent 4 }} 7 | annotations: 8 | {{- include "dco.webhook.annotation" . 
| nindent 4 }} 9 | webhooks: 10 | - admissionReviewVersions: 11 | - v1 12 | - v1beta1 13 | clientConfig: 14 | service: 15 | name: {{ include "dco.webhook.service" . }} 16 | namespace: {{ .Release.Namespace }} 17 | path: /validate-distributed-compute-dominodatalab-com-v1alpha1-mpicluster 18 | failurePolicy: Fail 19 | name: vmpicluster.kb.io 20 | rules: 21 | - apiGroups: 22 | - distributed-compute.dominodatalab.com 23 | apiVersions: 24 | - v1alpha1 25 | operations: 26 | - CREATE 27 | - UPDATE 28 | resources: 29 | - mpiclusters 30 | sideEffects: None 31 | - admissionReviewVersions: 32 | - v1 33 | - v1beta1 34 | clientConfig: 35 | service: 36 | name: {{ include "dco.webhook.service" . }} 37 | namespace: {{ .Release.Namespace }} 38 | path: /validate-distributed-compute-dominodatalab-com-v1alpha1-daskcluster 39 | failurePolicy: Fail 40 | name: vdaskcluster.kb.io 41 | rules: 42 | - apiGroups: 43 | - distributed-compute.dominodatalab.com 44 | apiVersions: 45 | - v1alpha1 46 | operations: 47 | - CREATE 48 | - UPDATE 49 | resources: 50 | - daskclusters 51 | sideEffects: None 52 | - admissionReviewVersions: 53 | - v1 54 | - v1beta1 55 | clientConfig: 56 | service: 57 | name: {{ include "dco.webhook.service" . }} 58 | namespace: {{ .Release.Namespace }} 59 | path: /validate-distributed-compute-dominodatalab-com-v1alpha1-raycluster 60 | failurePolicy: Fail 61 | name: vraycluster.kb.io 62 | rules: 63 | - apiGroups: 64 | - distributed-compute.dominodatalab.com 65 | apiVersions: 66 | - v1alpha1 67 | operations: 68 | - CREATE 69 | - UPDATE 70 | resources: 71 | - rayclusters 72 | sideEffects: None 73 | - admissionReviewVersions: 74 | - v1 75 | - v1beta1 76 | clientConfig: 77 | service: 78 | name: {{ include "dco.webhook.service" . }} 79 | namespace: {{ .Release.Namespace }} 80 | path: /validate-distributed-compute-dominodatalab-com-v1alpha1-sparkcluster 81 | failurePolicy: Fail 82 | name: vsparkcluster.kb.io 83 | rules: 84 | - apiGroups: 85 | - distributed-compute.dominodatalab.com 86 | apiVersions: 87 | - v1alpha1 88 | operations: 89 | - CREATE 90 | - UPDATE 91 | resources: 92 | - sparkclusters 93 | sideEffects: None 94 | -------------------------------------------------------------------------------- /deploy/helm/distributed-compute-operator/templates/webhook-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "dco.webhook.service" . }} 5 | labels: 6 | {{- include "common.labels.standard" . | nindent 4 }} 7 | spec: 8 | ports: 9 | - name: tcp-webhook 10 | port: 443 11 | targetPort: webhooks 12 | selector: 13 | {{- include "common.labels.matchLabels" . | nindent 4 }} 14 | -------------------------------------------------------------------------------- /dockerfiles/mpi-init.Dockerfile: -------------------------------------------------------------------------------- 1 | # A specific version of the Linux OS here is very important, because it defines versions 2 | # of core libraries (libc etc) the compiled binaries will be linked against. 
3 | # FYI, debian-9.13 -> libc-2.24 4 | # OSRP not necessary here because this is just the build environment; see the final image FROM at the bottom 5 | FROM quay.io/domino/debian:10.11-368763 6 | 7 | ARG OPENSSH_VERSION=8.8p1 8 | ARG OPENSSH_URL=https://mirrors.mit.edu/pub/OpenBSD/OpenSSH/portable/openssh-${OPENSSH_VERSION}.tar.gz 9 | ARG OPENSSH_SIG_URL=https://mirrors.mit.edu/pub/OpenBSD/OpenSSH/portable/openssh-${OPENSSH_VERSION}.tar.gz.asc 10 | 11 | ARG INSTALL_DIR=/opt/domino/mpi-cluster 12 | ARG INSTALL_BIN=${INSTALL_DIR}/bin 13 | 14 | WORKDIR /root 15 | 16 | ADD *.gpgkey ./ 17 | 18 | # Install common dependencies for compiling and setting things up 19 | RUN \ 20 | apt-get update && \ 21 | apt-get -y install \ 22 | build-essential \ 23 | curl \ 24 | gnupg && \ 25 | mkdir -p \ 26 | ${INSTALL_DIR} \ 27 | ${INSTALL_BIN} && \ 28 | gpg --import -q openssh.gpgkey > /dev/null && \ 29 | rm -f *.gpgkey 30 | 31 | # Download and compile openssh 32 | RUN \ 33 | # Newer versions of openssh include a mandatory privilege separation mechanism 34 | # that requires a special user to be available in the system. Although this 35 | # image does not execute sshd, such a user must exist for proper deployment. 36 | useradd -g 65534 -d /var/empty -s /bin/false sshd && \ 37 | curl -o openssh-src.tgz -LSsf ${OPENSSH_URL} && \ 38 | curl -o openssh-src.sig -LSsf ${OPENSSH_SIG_URL} && \ 39 | gpg --trust-model always -q --verify openssh-src.sig openssh-src.tgz && \ 40 | tar -xf openssh-src.tgz --no-same-permissions && \ 41 | cd openssh-${OPENSSH_VERSION} && \ 42 | ./configure \ 43 | --prefix=${INSTALL_DIR} \ 44 | --without-zlib \ 45 | --without-openssl && \ 46 | make && \ 47 | make install && \ 48 | cd - 49 | 50 | ADD mpi-worker-start.sh ${INSTALL_BIN} 51 | 52 | # Create a tarball containing all the necessary stuff 53 | RUN \ 54 | rm -f ${INSTALL_DIR}/etc/ssh_host_* && \ 55 | chmod 755 ${INSTALL_BIN}/mpi-worker-start.sh && \ 56 | tar -czf worker-utils.tgz \ 57 | ${INSTALL_DIR}/bin \ 58 | ${INSTALL_DIR}/etc \ 59 | ${INSTALL_DIR}/libexec \ 60 | ${INSTALL_DIR}/sbin 61 | 62 | # The final image only contains built artifacts. 63 | # The base image should be up-to-date, but a specific version is not important.
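# Editor's note (not part of the original Dockerfile): this image only ships
# the worker-utils.tgz artifact; the operator references it through its
# MPIInitImage setting, and the expectation is that it runs ahead of the MPI
# worker container so the default CMD below unpacks the tools onto a shared
# volume. A rough, hypothetical fragment of that wiring (volume and container
# names are placeholders; the real templates live in pkg/cluster/mpi):
#
#   initContainers:
#     - name: mpi-init
#       image: <this image>
#       volumeMounts:
#         - name: mpi-cluster-tools
#           mountPath: /opt/domino/mpi-cluster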
64 | FROM quay.io/domino/debian:10.11-368763 65 | WORKDIR /root 66 | COPY --from=0 /root/worker-utils.tgz ./ 67 | CMD tar -C / -xf /root/worker-utils.tgz 68 | -------------------------------------------------------------------------------- /dockerfiles/mpi-sync.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/domino/debian:10.11-368763 2 | 3 | ARG DOMINO_UID=12574 4 | ARG DOMINO_USER=domino 5 | ARG DOMINO_GID=12574 6 | ARG DOMINO_GROUP=domino 7 | 8 | ARG DOMINO_DIR=/opt/domino/rsync 9 | ARG DOMINO_BIN=$DOMINO_DIR/bin 10 | ARG DOMINO_ETC=$DOMINO_DIR/etc 11 | 12 | ARG RSYNC_RUN_DIR=/run/rsyncd-${DOMINO_USER} 13 | ARG RSYNC_CONFIG_FILE=rsyncd.conf 14 | ARG RSYNC_START_SCRIPT=rsync-start.sh 15 | 16 | ARG ALLENV="\$RSYNC_RUN_DIR,\$DOMINO_ETC,\$RSYNC_CONFIG_FILE" 17 | 18 | WORKDIR /root 19 | 20 | RUN \ 21 | apt-get update && \ 22 | apt-get -y install \ 23 | rsync \ 24 | gettext-base \ 25 | procps && \ 26 | rm -rf /var/lib/apt/lists/* && \ 27 | mkdir -p \ 28 | "$DOMINO_DIR" \ 29 | "$DOMINO_BIN" \ 30 | "$DOMINO_ETC" \ 31 | "$RSYNC_RUN_DIR" 32 | 33 | ADD $RSYNC_START_SCRIPT $RSYNC_CONFIG_FILE ./ 34 | 35 | RUN \ 36 | groupadd -g $DOMINO_GID $DOMINO_GROUP && \ 37 | useradd -u $DOMINO_UID -g $DOMINO_GID -mN -s /bin/bash $DOMINO_USER && \ 38 | envsubst "$ALLENV" < "$RSYNC_START_SCRIPT" > "$DOMINO_BIN/$RSYNC_START_SCRIPT" && \ 39 | envsubst "$ALLENV" < "$RSYNC_CONFIG_FILE" > "$DOMINO_ETC/$RSYNC_CONFIG_FILE" && \ 40 | chown -R $DOMINO_USER:$DOMINO_GROUP "$RSYNC_RUN_DIR" && \ 41 | chown -R $DOMINO_USER:$DOMINO_GROUP "$DOMINO_DIR" && \ 42 | chmod 755 "$DOMINO_BIN/$RSYNC_START_SCRIPT" && \ 43 | chmod 644 "$DOMINO_ETC/$RSYNC_CONFIG_FILE" 44 | 45 | # For testing -- to be removed 46 | RUN \ 47 | chown -R $DOMINO_USER:$DOMINO_GROUP /mnt 48 | -------------------------------------------------------------------------------- /dockerfiles/mpi-worker-start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o nounset 4 | set -o errexit 5 | 6 | INSTALL_DIR="/opt/domino/mpi-cluster" 7 | SSH_USER="sshd" 8 | SSH_RUN_DIR="/run/sshd-${DOMINO_USER}" 9 | 10 | mkdir -p "$SSH_RUN_DIR" 11 | chmod 777 "$SSH_RUN_DIR" 12 | 13 | if ! id $SSH_USER >/dev/null 2>&1; then 14 | useradd -g 65534 -mN -s "/usr/sbin/nologin" -d "$SSH_RUN_DIR" $SSH_USER 15 | fi 16 | 17 | if ! cut -d: -f3 < /etc/group | grep "^${DOMINO_GID}$" >/dev/null 2>&1; then 18 | groupadd -g $DOMINO_GID $DOMINO_GROUP 19 | fi 20 | if ! id $DOMINO_UID >/dev/null 2>&1; then 21 | useradd -u $DOMINO_UID -g $DOMINO_GID -mN -s /bin/bash -d "$DOMINO_HOME_DIR" $DOMINO_USER 22 | else 23 | # Change username of user with matching userid if needed 24 | EXISTING_USER=$(id -nu $DOMINO_UID) 25 | if [ "$EXISTING_USER" != "$DOMINO_USER" ]; then 26 | usermod -l $DOMINO_USER $EXISTING_USER 27 | fi 28 | 29 | # Change groupname of group with matching groupid if needed 30 | EXISTING_GROUP=$(id -ng $DOMINO_GID) 31 | if [ "$EXISTING_GROUP" != "$DOMINO_GROUP" ]; then 32 | groupmod --new-name $DOMINO_GROUP $EXISTING_GROUP 33 | fi 34 | 35 | # Change home directory (idempotent) 36 | usermod -d "$DOMINO_HOME_DIR" $DOMINO_USER 37 | 38 | # Add to domino group (idempotent) 39 | usermod -a -G $DOMINO_GROUP $DOMINO_USER 40 | fi 41 | 42 | 43 | # Add the new domino user to the non-root groups of the current container user 44 | for gid in `id -G`; do 45 | if [ $gid != 0 ]; then 46 | # Add user to a new/existing group with desired id. 
47 | # https://askubuntu.com/a/639998 48 | group_name=$(cut -d: -f1 < <(getent group $gid)) 49 | if [ -z "$group_name" ]; then 50 | group_name="group-$gid" 51 | groupadd -g $gid $group_name 52 | fi 53 | usermod -a -G $group_name $DOMINO_USER 54 | fi 55 | done 56 | 57 | cat << EOF > "$DOMINO_HOME_DIR/.profile" 58 | if [ "\$BASH" ] && [ -f ~/.bashrc ]; then 59 | . ~/.bashrc 60 | fi 61 | EOF 62 | chmod 644 "$DOMINO_HOME_DIR/.profile" 63 | chown $DOMINO_UID:$DOMINO_GID "$DOMINO_HOME_DIR/.profile" 64 | 65 | rm -f "$DOMINO_HOME_DIR/.bashrc" 66 | touch "$DOMINO_HOME_DIR/.bashrc" 67 | printenv | grep PATH | sed 's;^;export ;' >> "$DOMINO_HOME_DIR/.bashrc" 68 | printenv | grep MPI | sed 's;^;export ;' >> "$DOMINO_HOME_DIR/.bashrc" 69 | printenv | grep DOMINO | sed 's;^;export ;' >> "$DOMINO_HOME_DIR/.bashrc" 70 | chmod 644 "$DOMINO_HOME_DIR/.bashrc" 71 | chown $DOMINO_UID:$DOMINO_GID "$DOMINO_HOME_DIR/.bashrc" 72 | 73 | CONFIG_DIR="$INSTALL_DIR/etc" 74 | mkdir -p "$CONFIG_DIR" 75 | 76 | rm -f "$CONFIG_DIR/ssh_host_*" 77 | "$INSTALL_DIR/bin/ssh-keygen" -f "$CONFIG_DIR/ssh_host_key" -N '' -t ed25519 78 | chmod 400 "$CONFIG_DIR/ssh_host_key" 79 | chown $DOMINO_UID:$DOMINO_GID "$CONFIG_DIR/ssh_host_key" 80 | 81 | cat << EOF > "$CONFIG_DIR/sshd_config" 82 | Port $DOMINO_SSH_PORT 83 | HostKey "$CONFIG_DIR/ssh_host_key" 84 | AuthorizedKeysFile "$DOMINO_KEYS_PATH" 85 | PidFile "$SSH_RUN_DIR/sshd.pid" 86 | AllowUsers $DOMINO_USER 87 | EOF 88 | chmod 444 "$CONFIG_DIR/sshd_config" 89 | 90 | su -c "$INSTALL_DIR/sbin/sshd -f \"$CONFIG_DIR/sshd_config\" -De" - $DOMINO_USER 91 | -------------------------------------------------------------------------------- /dockerfiles/rsync-start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o nounset 4 | set -o errexit 5 | 6 | /usr/bin/rsync \ 7 | --daemon \ 8 | --no-detach \ 9 | --verbose \ 10 | --config="$DOMINO_ETC/$RSYNC_CONFIG_FILE" \ 11 | --port=$RSYNC_PORT 12 | -------------------------------------------------------------------------------- /dockerfiles/rsyncd.conf: -------------------------------------------------------------------------------- 1 | pid file = $RSYNC_RUN_DIR/rsync.pid 2 | lock file = $RSYNC_RUN_DIR/rsync.lock 3 | log file = /dev/stdout 4 | use chroot = false 5 | read only = false 6 | timeout = 300 7 | 8 | [mnt] 9 | path = /mnt 10 | 11 | [repos] 12 | path = /repos 13 | 14 | [imported] 15 | path = /mnt/imported 16 | -------------------------------------------------------------------------------- /docs/development.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dominodatalab/distributed-compute-operator/2f119d377b1632ea2e9817cacc24a6a54414aed1/docs/development.md -------------------------------------------------------------------------------- /docs/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dominodatalab/distributed-compute-operator/2f119d377b1632ea2e9817cacc24a6a54414aed1/docs/img/logo.png -------------------------------------------------------------------------------- /hack/boilerplate.go.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dominodatalab/distributed-compute-operator/2f119d377b1632ea2e9817cacc24a6a54414aed1/hack/boilerplate.go.txt -------------------------------------------------------------------------------- /istio/global-strict-mtls.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: security.istio.io/v1beta1 2 | kind: PeerAuthentication 3 | metadata: 4 | name: default 5 | namespace: istio-system 6 | spec: 7 | mtls: 8 | mode: STRICT 9 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "github.com/dominodatalab/distributed-compute-operator/cmd" 4 | 5 | func main() { 6 | cmd.Execute() 7 | } 8 | -------------------------------------------------------------------------------- /pkg/cluster/dask/clientports.go: -------------------------------------------------------------------------------- 1 | package dask 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | "sigs.k8s.io/controller-runtime/pkg/client" 6 | 7 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/components" 8 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 9 | ) 10 | 11 | func ClientPortsService() core.OwnedComponent { 12 | return components.ClientPortsServiceComponent{ 13 | ClientPorts: func(obj *client.Object) []corev1.ServicePort { 14 | return daskCluster(*obj).Spec.AdditionalClientPorts 15 | }, 16 | ClientLabels: func(obj *client.Object) map[string]string { 17 | return daskCluster(*obj).Spec.NetworkPolicy.ClientLabels 18 | }, 19 | Meta: meta, 20 | } 21 | } 22 | 23 | func ClientPortsNetworkPolicy() core.OwnedComponent { 24 | return components.ClientPortsNetworkPolicyComponent{ 25 | ClientPorts: func(obj *client.Object) []corev1.ServicePort { 26 | return daskCluster(*obj).Spec.AdditionalClientPorts 27 | }, 28 | ClientLabels: func(obj *client.Object) map[string]string { 29 | return daskCluster(*obj).Spec.NetworkPolicy.ClientLabels 30 | }, 31 | Meta: meta, 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /pkg/cluster/dask/clusterstatusupdate.go: -------------------------------------------------------------------------------- 1 | package dask 2 | 3 | import ( 4 | appsv1 "k8s.io/api/apps/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | "sigs.k8s.io/controller-runtime/pkg/client" 7 | 8 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 9 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/components" 10 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 11 | ) 12 | 13 | func ClusterStatusUpdate() core.Component { 14 | return components.ClusterStatusUpdate(func(obj client.Object) components.ClusterStatusUpdateDataSource { 15 | return &clusterStatusUpdateDS{dc: daskCluster(obj)} 16 | }) 17 | } 18 | 19 | type clusterStatusUpdateDS struct { 20 | dc *dcv1alpha1.DaskCluster 21 | } 22 | 23 | func (c *clusterStatusUpdateDS) ListOpts() []client.ListOption { 24 | return []client.ListOption{ 25 | client.InNamespace(c.dc.Namespace), 26 | client.MatchingLabels(meta.StandardLabels(c.dc)), 27 | } 28 | } 29 | 30 | func (c *clusterStatusUpdateDS) StatefulSet() *appsv1.StatefulSet { 31 | return &appsv1.StatefulSet{ 32 | ObjectMeta: metav1.ObjectMeta{ 33 | Name: meta.InstanceName(c.dc, ComponentWorker), 34 | Namespace: c.dc.Namespace, 35 | }, 36 | } 37 | } 38 | 39 | func (c *clusterStatusUpdateDS) ClusterStatusConfig() *dcv1alpha1.ClusterStatusConfig { 40 | return &c.dc.Status.ClusterStatusConfig 41 | } 42 | 43 | func (c *clusterStatusUpdateDS) Image() *dcv1alpha1.OCIImageDefinition { 44 | return 
c.dc.Spec.Image 45 | } 46 | -------------------------------------------------------------------------------- /pkg/cluster/dask/configmap.go: -------------------------------------------------------------------------------- 1 | package dask 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | "sigs.k8s.io/controller-runtime/pkg/client" 7 | 8 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 9 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/metadata" 10 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/components" 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 12 | ) 13 | 14 | func ConfigMapKeyTab() core.OwnedComponent { 15 | return components.ConfigMap(func(obj client.Object) components.ConfigMapDataSource { 16 | return &configMapDS{dc: daskCluster(obj)} 17 | }) 18 | } 19 | 20 | type configMapDS struct { 21 | dc *dcv1alpha1.DaskCluster 22 | } 23 | 24 | func (s *configMapDS) ConfigMap() *corev1.ConfigMap { 25 | cm := &corev1.ConfigMap{ 26 | ObjectMeta: metav1.ObjectMeta{ 27 | Name: meta.InstanceName(s.dc, metadata.ComponentNone), 28 | Namespace: s.dc.Namespace, 29 | Labels: meta.StandardLabels(s.dc), 30 | }, 31 | } 32 | 33 | if s.dc.Spec.KerberosKeytab == nil { 34 | return cm 35 | } 36 | cm.BinaryData = map[string][]byte{"keytab": s.dc.Spec.KerberosKeytab.Contents} 37 | 38 | return cm 39 | } 40 | 41 | func (s *configMapDS) Delete() bool { 42 | return s.dc.Spec.KerberosKeytab == nil 43 | } 44 | -------------------------------------------------------------------------------- /pkg/cluster/dask/dask_test.go: -------------------------------------------------------------------------------- 1 | package dask 2 | 3 | import ( 4 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 5 | 6 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 7 | ) 8 | 9 | func testDaskCluster() *dcv1alpha1.DaskCluster { 10 | return &dcv1alpha1.DaskCluster{ 11 | TypeMeta: metav1.TypeMeta{ 12 | Kind: "DaskCluster", 13 | APIVersion: "distributed-compute.dominodatalab.com/v1test1", 14 | }, 15 | ObjectMeta: metav1.ObjectMeta{ 16 | Name: "test", 17 | Namespace: "ns", 18 | }, 19 | Spec: dcv1alpha1.DaskClusterSpec{ 20 | ScalableClusterConfig: dcv1alpha1.ScalableClusterConfig{ 21 | ClusterConfig: dcv1alpha1.ClusterConfig{ 22 | Image: &dcv1alpha1.OCIImageDefinition{ 23 | Registry: "", 24 | Repository: "daskdev/dask", 25 | Tag: "test-tag", 26 | }, 27 | NetworkPolicy: dcv1alpha1.NetworkPolicyConfig{ 28 | ClientLabels: map[string]string{ 29 | "test-client": "true", 30 | }, 31 | DashboardLabels: map[string]string{ 32 | "test-ui-client": "true", 33 | }, 34 | DashboardNamespaceLabels: map[string]string{ 35 | "domino-platform": "true", 36 | }, 37 | }, 38 | PodSecurityPolicy: "privileged", 39 | }, 40 | }, 41 | Scheduler: dcv1alpha1.WorkloadConfig{}, 42 | Worker: dcv1alpha1.DaskClusterWorker{}, 43 | SchedulerPort: 8786, 44 | DashboardPort: 8787, 45 | WorkerPort: 3000, 46 | NannyPort: 3001, 47 | }, 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /pkg/cluster/dask/horizonalpodautoscaler.go: -------------------------------------------------------------------------------- 1 | package dask 2 | 3 | import ( 4 | autoscalingv2 "k8s.io/api/autoscaling/v2" 5 | corev1 "k8s.io/api/core/v1" 6 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 7 | "sigs.k8s.io/controller-runtime/pkg/client" 8 | 9 | dcv1alpha1 
"github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 10 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/metadata" 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/components" 12 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 13 | ) 14 | 15 | func HorizontalPodAutoscaler() core.OwnedComponent { 16 | return components.HorizontalPodAutoscaler(func(obj client.Object) components.HorizontalPodAutoscalerDataSource { 17 | return &horizontalPodAutoscalerDS{dc: daskCluster(obj)} 18 | }) 19 | } 20 | 21 | type horizontalPodAutoscalerDS struct { 22 | dc *dcv1alpha1.DaskCluster 23 | } 24 | 25 | func (s *horizontalPodAutoscalerDS) HorizontalPodAutoscaler() *autoscalingv2.HorizontalPodAutoscaler { 26 | hpa := &autoscalingv2.HorizontalPodAutoscaler{ 27 | ObjectMeta: metav1.ObjectMeta{ 28 | Name: meta.InstanceName(s.dc, metadata.ComponentNone), 29 | Namespace: s.dc.Namespace, 30 | Labels: meta.StandardLabels(s.dc), 31 | }, 32 | } 33 | 34 | as := s.dc.Spec.Autoscaling 35 | if as == nil { 36 | return hpa 37 | } 38 | 39 | var behavior *autoscalingv2.HorizontalPodAutoscalerBehavior 40 | if as.ScaleDownStabilizationWindowSeconds != nil { 41 | behavior = &autoscalingv2.HorizontalPodAutoscalerBehavior{ 42 | ScaleDown: &autoscalingv2.HPAScalingRules{ 43 | StabilizationWindowSeconds: as.ScaleDownStabilizationWindowSeconds, 44 | }, 45 | } 46 | } 47 | 48 | var metrics []autoscalingv2.MetricSpec 49 | if as.AverageCPUUtilization != nil { 50 | metrics = append(metrics, autoscalingv2.MetricSpec{ 51 | Type: autoscalingv2.ResourceMetricSourceType, 52 | Resource: &autoscalingv2.ResourceMetricSource{ 53 | Name: corev1.ResourceCPU, 54 | Target: autoscalingv2.MetricTarget{ 55 | Type: autoscalingv2.UtilizationMetricType, 56 | AverageUtilization: s.dc.Spec.Autoscaling.AverageCPUUtilization, 57 | }, 58 | }, 59 | }) 60 | } 61 | if as.AverageMemoryUtilization != nil { 62 | metrics = append(metrics, autoscalingv2.MetricSpec{ 63 | Type: autoscalingv2.ResourceMetricSourceType, 64 | Resource: &autoscalingv2.ResourceMetricSource{ 65 | Name: corev1.ResourceMemory, 66 | Target: autoscalingv2.MetricTarget{ 67 | Type: autoscalingv2.UtilizationMetricType, 68 | AverageUtilization: s.dc.Spec.Autoscaling.AverageMemoryUtilization, 69 | }, 70 | }, 71 | }) 72 | } 73 | 74 | hpa.Spec = autoscalingv2.HorizontalPodAutoscalerSpec{ 75 | ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{ 76 | Kind: s.dc.Kind, 77 | Name: s.dc.Name, 78 | APIVersion: s.dc.APIVersion, 79 | }, 80 | MinReplicas: s.dc.Spec.Autoscaling.MinReplicas, 81 | MaxReplicas: s.dc.Spec.Autoscaling.MaxReplicas, 82 | Metrics: metrics, 83 | Behavior: behavior, 84 | } 85 | 86 | return hpa 87 | } 88 | 89 | func (s *horizontalPodAutoscalerDS) Delete() bool { 90 | return s.dc.Spec.Autoscaling == nil 91 | } 92 | -------------------------------------------------------------------------------- /pkg/cluster/dask/istiopeerauthentication.go: -------------------------------------------------------------------------------- 1 | package dask 2 | 3 | import ( 4 | "sigs.k8s.io/controller-runtime/pkg/client" 5 | 6 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 7 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/metadata" 8 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/components" 9 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 10 | 
"github.com/dominodatalab/distributed-compute-operator/pkg/resources/istio" 11 | ) 12 | 13 | func IstioPeerAuthentication(enabled bool) core.Component { 14 | return components.IstioPeerAuthentication(func(obj client.Object) components.IstioPeerAuthenticationDataSource { 15 | return &istioPeerAuthenticationDS{dc: daskCluster(obj), enabled: enabled} 16 | }) 17 | } 18 | 19 | type istioPeerAuthenticationDS struct { 20 | dc *dcv1alpha1.DaskCluster 21 | enabled bool 22 | } 23 | 24 | func (s *istioPeerAuthenticationDS) PeerAuthInfo() *istio.PeerAuthInfo { 25 | return &istio.PeerAuthInfo{ 26 | Name: meta.InstanceName(s.dc, metadata.ComponentNone), 27 | Namespace: s.dc.Namespace, 28 | Labels: meta.StandardLabels(s.dc), 29 | Selector: meta.MatchLabels(s.dc), 30 | Mode: s.dc.Spec.MutualTLSMode, 31 | } 32 | } 33 | 34 | func (s *istioPeerAuthenticationDS) Enabled() bool { 35 | return s.enabled 36 | } 37 | 38 | func (s *istioPeerAuthenticationDS) Delete() bool { 39 | return s.dc.Spec.MutualTLSMode == "" 40 | } 41 | -------------------------------------------------------------------------------- /pkg/cluster/dask/metadata.go: -------------------------------------------------------------------------------- 1 | package dask 2 | 3 | import ( 4 | "sigs.k8s.io/controller-runtime/pkg/client" 5 | 6 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 7 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/metadata" 8 | ) 9 | 10 | const ( 11 | ApplicationName = "dask" 12 | ComponentScheduler metadata.Component = "scheduler" 13 | ComponentWorker metadata.Component = "worker" 14 | ) 15 | 16 | var meta = metadata.NewProvider( 17 | ApplicationName, 18 | func(obj client.Object) string { return daskCluster(obj).Spec.Image.Tag }, 19 | func(obj client.Object) map[string]string { return daskCluster(obj).Spec.GlobalLabels }, 20 | ) 21 | 22 | func daskCluster(obj client.Object) *dcv1alpha1.DaskCluster { 23 | return obj.(*dcv1alpha1.DaskCluster) 24 | } 25 | -------------------------------------------------------------------------------- /pkg/cluster/dask/rbac.go: -------------------------------------------------------------------------------- 1 | package dask 2 | 3 | import ( 4 | rbacv1 "k8s.io/api/rbac/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | "sigs.k8s.io/controller-runtime/pkg/client" 7 | 8 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 9 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/metadata" 10 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/components" 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 12 | ) 13 | 14 | var ( 15 | policyAPIGroups = []string{"policy"} 16 | podSecurityPolicyResources = []string{"podsecuritypolicies"} 17 | useVerbs = []string{"use"} 18 | ) 19 | 20 | func RolePodSecurityPolicy() core.OwnedComponent { 21 | return components.Role(func(obj client.Object) components.RoleDataSource { 22 | return &pspDS{dc: daskCluster(obj)} 23 | }) 24 | } 25 | 26 | func RoleBindingPodSecurityPolicy() core.OwnedComponent { 27 | return components.RoleBinding(func(obj client.Object) components.RoleBindingDataSource { 28 | return &pspDS{dc: daskCluster(obj)} 29 | }) 30 | } 31 | 32 | type pspDS struct { 33 | dc *dcv1alpha1.DaskCluster 34 | } 35 | 36 | func (s *pspDS) Role() *rbacv1.Role { 37 | return &rbacv1.Role{ 38 | ObjectMeta: s.objectMeta(), 39 | Rules: []rbacv1.PolicyRule{ 40 | { 41 | APIGroups: policyAPIGroups, 42 | Resources: 
podSecurityPolicyResources, 43 | Verbs: useVerbs, 44 | ResourceNames: []string{s.dc.Spec.PodSecurityPolicy}, 45 | }, 46 | }, 47 | } 48 | } 49 | 50 | func (s *pspDS) RoleBinding() *rbacv1.RoleBinding { 51 | om := s.objectMeta() 52 | 53 | return &rbacv1.RoleBinding{ 54 | ObjectMeta: om, 55 | RoleRef: rbacv1.RoleRef{ 56 | APIGroup: rbacv1.GroupName, 57 | Kind: "Role", 58 | Name: om.Name, 59 | }, 60 | Subjects: []rbacv1.Subject{ 61 | { 62 | Kind: rbacv1.ServiceAccountKind, 63 | Name: om.Name, 64 | Namespace: s.dc.Namespace, 65 | }, 66 | }, 67 | } 68 | } 69 | 70 | func (s *pspDS) Delete() bool { 71 | return s.dc.Spec.PodSecurityPolicy == "" 72 | } 73 | 74 | func (s *pspDS) objectMeta() metav1.ObjectMeta { 75 | return metav1.ObjectMeta{ 76 | Name: meta.InstanceName(s.dc, metadata.ComponentNone), 77 | Namespace: s.dc.Namespace, 78 | Labels: meta.StandardLabels(s.dc), 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /pkg/cluster/dask/rbac_test.go: -------------------------------------------------------------------------------- 1 | package dask 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | rbacv1 "k8s.io/api/rbac/v1" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | ) 10 | 11 | func TestPspDS_Role(t *testing.T) { 12 | dc := testDaskCluster() 13 | ds := pspDS{dc: dc} 14 | 15 | actual := ds.Role() 16 | expected := &rbacv1.Role{ 17 | ObjectMeta: metav1.ObjectMeta{ 18 | Name: "test-dask", 19 | Namespace: "ns", 20 | Labels: map[string]string{ 21 | "app.kubernetes.io/instance": "test", 22 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 23 | "app.kubernetes.io/name": "dask", 24 | "app.kubernetes.io/version": "test-tag", 25 | }, 26 | }, 27 | Rules: []rbacv1.PolicyRule{ 28 | { 29 | APIGroups: []string{"policy"}, 30 | Resources: []string{"podsecuritypolicies"}, 31 | Verbs: []string{"use"}, 32 | ResourceNames: []string{"privileged"}, 33 | }, 34 | }, 35 | } 36 | 37 | assert.Equal(t, expected, actual) 38 | } 39 | 40 | func TestPspDS_RoleBinding(t *testing.T) { 41 | dc := testDaskCluster() 42 | ds := pspDS{dc: dc} 43 | 44 | actual := ds.RoleBinding() 45 | expected := &rbacv1.RoleBinding{ 46 | ObjectMeta: metav1.ObjectMeta{ 47 | Name: "test-dask", 48 | Namespace: "ns", 49 | Labels: map[string]string{ 50 | "app.kubernetes.io/instance": "test", 51 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 52 | "app.kubernetes.io/name": "dask", 53 | "app.kubernetes.io/version": "test-tag", 54 | }, 55 | }, 56 | RoleRef: rbacv1.RoleRef{ 57 | APIGroup: rbacv1.GroupName, 58 | Kind: "Role", 59 | Name: "test-dask", 60 | }, 61 | Subjects: []rbacv1.Subject{ 62 | { 63 | Kind: rbacv1.ServiceAccountKind, 64 | Name: "test-dask", 65 | Namespace: "ns", 66 | }, 67 | }, 68 | } 69 | 70 | assert.Equal(t, expected, actual) 71 | } 72 | 73 | func TestPspDS_Delete(t *testing.T) { 74 | dc := testDaskCluster() 75 | ds := pspDS{dc: dc} 76 | 77 | t.Run("provided_name", func(t *testing.T) { 78 | dc.Spec.PodSecurityPolicy = "restricted" 79 | assert.False(t, ds.Delete()) 80 | }) 81 | 82 | t.Run("empty_name", func(t *testing.T) { 83 | dc.Spec.PodSecurityPolicy = "" 84 | assert.True(t, ds.Delete()) 85 | }) 86 | } 87 | -------------------------------------------------------------------------------- /pkg/cluster/dask/service.go: -------------------------------------------------------------------------------- 1 | package dask 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | 
"k8s.io/apimachinery/pkg/util/intstr" 7 | "sigs.k8s.io/controller-runtime/pkg/client" 8 | 9 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 10 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/metadata" 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/components" 12 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 13 | ) 14 | 15 | func ServiceScheduler() core.OwnedComponent { 16 | return components.Service(func(obj client.Object) components.ServiceDataSource { 17 | return &serviceDS{dc: daskCluster(obj), comp: ComponentScheduler} 18 | }) 19 | } 20 | 21 | func ServiceWorker() core.OwnedComponent { 22 | return components.Service(func(obj client.Object) components.ServiceDataSource { 23 | return &serviceDS{dc: daskCluster(obj), comp: ComponentWorker} 24 | }) 25 | } 26 | 27 | type serviceDS struct { 28 | dc *dcv1alpha1.DaskCluster 29 | comp metadata.Component 30 | } 31 | 32 | func (s *serviceDS) Service() *corev1.Service { 33 | return &corev1.Service{ 34 | ObjectMeta: metav1.ObjectMeta{ 35 | Name: meta.InstanceName(s.dc, s.comp), 36 | Namespace: s.dc.Namespace, 37 | Labels: meta.StandardLabelsWithComponent(s.dc, s.comp, nil), 38 | }, 39 | Spec: corev1.ServiceSpec{ 40 | ClusterIP: corev1.ClusterIPNone, 41 | Selector: meta.MatchLabelsWithComponent(s.dc, s.comp), 42 | Ports: s.ports(), 43 | }, 44 | } 45 | } 46 | 47 | func (s *serviceDS) ports() []corev1.ServicePort { 48 | if s.comp == ComponentScheduler { 49 | return []corev1.ServicePort{ 50 | { 51 | Name: "tcp-serve", 52 | Port: s.dc.Spec.SchedulerPort, 53 | TargetPort: intstr.FromString("serve"), 54 | }, 55 | { 56 | Name: "tcp-dashboard", 57 | Port: s.dc.Spec.DashboardPort, 58 | TargetPort: intstr.FromString("dashboard"), 59 | }, 60 | } 61 | } 62 | 63 | return []corev1.ServicePort{ 64 | { 65 | Name: "tcp-worker", 66 | Port: s.dc.Spec.WorkerPort, 67 | TargetPort: intstr.FromString("worker"), 68 | }, 69 | { 70 | Name: "tcp-nanny", 71 | Port: s.dc.Spec.NannyPort, 72 | TargetPort: intstr.FromString("nanny"), 73 | }, 74 | { 75 | Name: "tcp-dashboard", 76 | Port: s.dc.Spec.DashboardPort, 77 | TargetPort: intstr.FromString("dashboard"), 78 | }, 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /pkg/cluster/dask/service_test.go: -------------------------------------------------------------------------------- 1 | package dask 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | corev1 "k8s.io/api/core/v1" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | "k8s.io/apimachinery/pkg/util/intstr" 10 | ) 11 | 12 | func TestServiceDataSource_Service(t *testing.T) { 13 | dc := testDaskCluster() 14 | 15 | t.Run("scheduler", func(t *testing.T) { 16 | ds := serviceDS{dc: dc, comp: ComponentScheduler} 17 | 18 | actual := ds.Service() 19 | expected := &corev1.Service{ 20 | ObjectMeta: metav1.ObjectMeta{ 21 | Name: "test-dask-scheduler", 22 | Namespace: "ns", 23 | Labels: map[string]string{ 24 | "app.kubernetes.io/component": "scheduler", 25 | "app.kubernetes.io/instance": "test", 26 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 27 | "app.kubernetes.io/name": "dask", 28 | "app.kubernetes.io/version": "test-tag", 29 | }, 30 | }, 31 | Spec: corev1.ServiceSpec{ 32 | ClusterIP: corev1.ClusterIPNone, 33 | Selector: map[string]string{ 34 | "app.kubernetes.io/component": "scheduler", 35 | "app.kubernetes.io/instance": "test", 36 | "app.kubernetes.io/name": "dask", 37 | 
}, 38 | Ports: []corev1.ServicePort{ 39 | { 40 | Name: "tcp-serve", 41 | Port: 8786, 42 | TargetPort: intstr.FromString("serve"), 43 | }, 44 | { 45 | Name: "tcp-dashboard", 46 | Port: 8787, 47 | TargetPort: intstr.FromString("dashboard"), 48 | }, 49 | }, 50 | }, 51 | } 52 | 53 | assert.Equal(t, expected, actual) 54 | }) 55 | 56 | t.Run("worker", func(t *testing.T) { 57 | ds := serviceDS{dc: dc, comp: ComponentWorker} 58 | 59 | actual := ds.Service() 60 | expected := &corev1.Service{ 61 | ObjectMeta: metav1.ObjectMeta{ 62 | Name: "test-dask-worker", 63 | Namespace: "ns", 64 | Labels: map[string]string{ 65 | "app.kubernetes.io/component": "worker", 66 | "app.kubernetes.io/instance": "test", 67 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 68 | "app.kubernetes.io/name": "dask", 69 | "app.kubernetes.io/version": "test-tag", 70 | }, 71 | }, 72 | Spec: corev1.ServiceSpec{ 73 | ClusterIP: corev1.ClusterIPNone, 74 | Selector: map[string]string{ 75 | "app.kubernetes.io/component": "worker", 76 | "app.kubernetes.io/instance": "test", 77 | "app.kubernetes.io/name": "dask", 78 | }, 79 | Ports: []corev1.ServicePort{ 80 | { 81 | Name: "tcp-worker", 82 | Port: 3000, 83 | TargetPort: intstr.FromString("worker"), 84 | }, 85 | { 86 | Name: "tcp-nanny", 87 | Port: 3001, 88 | TargetPort: intstr.FromString("nanny"), 89 | }, 90 | { 91 | Name: "tcp-dashboard", 92 | Port: 8787, 93 | TargetPort: intstr.FromString("dashboard"), 94 | }, 95 | }, 96 | }, 97 | } 98 | 99 | assert.Equal(t, expected, actual) 100 | }) 101 | } 102 | -------------------------------------------------------------------------------- /pkg/cluster/dask/serviceaccount.go: -------------------------------------------------------------------------------- 1 | package dask 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | "k8s.io/utils/pointer" 7 | "sigs.k8s.io/controller-runtime/pkg/client" 8 | 9 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 10 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/metadata" 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/components" 12 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 13 | ) 14 | 15 | func ServiceAccount() core.OwnedComponent { 16 | factory := func(obj client.Object) components.ServiceAccountDataSource { 17 | return &serviceAccountDS{dc: daskCluster(obj)} 18 | } 19 | 20 | return components.ServiceAccount(factory) 21 | } 22 | 23 | type serviceAccountDS struct { 24 | dc *dcv1alpha1.DaskCluster 25 | } 26 | 27 | func (s *serviceAccountDS) ServiceAccount() *corev1.ServiceAccount { 28 | return &corev1.ServiceAccount{ 29 | ObjectMeta: metav1.ObjectMeta{ 30 | Name: meta.InstanceName(s.dc, metadata.ComponentNone), 31 | Namespace: s.dc.Namespace, 32 | Labels: meta.StandardLabels(s.dc), 33 | }, 34 | AutomountServiceAccountToken: pointer.Bool(s.dc.Spec.ServiceAccount.AutomountServiceAccountToken), 35 | } 36 | } 37 | 38 | func (s *serviceAccountDS) Delete() bool { 39 | return s.dc.Spec.ServiceAccount.Name != "" 40 | } 41 | -------------------------------------------------------------------------------- /pkg/cluster/dask/serviceaccount_test.go: -------------------------------------------------------------------------------- 1 | package dask 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | "github.com/stretchr/testify/require" 8 | corev1 "k8s.io/api/core/v1" 9 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 10 
| "k8s.io/utils/pointer" 11 | ) 12 | 13 | func TestServiceAccountDS_ServiceAccount(t *testing.T) { 14 | dc := testDaskCluster() 15 | ds := serviceAccountDS{dc: dc} 16 | 17 | actual := ds.ServiceAccount() 18 | expected := &corev1.ServiceAccount{ 19 | ObjectMeta: metav1.ObjectMeta{ 20 | Name: "test-dask", 21 | Namespace: "ns", 22 | Labels: map[string]string{ 23 | "app.kubernetes.io/instance": "test", 24 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 25 | "app.kubernetes.io/name": "dask", 26 | "app.kubernetes.io/version": "test-tag", 27 | }, 28 | }, 29 | AutomountServiceAccountToken: pointer.Bool(false), 30 | } 31 | 32 | require.Equal(t, expected, actual) 33 | 34 | dc.Spec.ServiceAccount.AutomountServiceAccountToken = true 35 | actual = ds.ServiceAccount() 36 | 37 | assert.Equal(t, actual.AutomountServiceAccountToken, pointer.Bool(true)) 38 | } 39 | 40 | func TestServiceAccountDS_Delete(t *testing.T) { 41 | dc := testDaskCluster() 42 | ds := serviceAccountDS{dc: dc} 43 | 44 | t.Run("empty_name", func(t *testing.T) { 45 | dc.Spec.ServiceAccount.Name = "" 46 | assert.False(t, ds.Delete()) 47 | }) 48 | 49 | t.Run("provided_name", func(t *testing.T) { 50 | dc.Spec.ServiceAccount.Name = "other" 51 | assert.True(t, ds.Delete()) 52 | }) 53 | } 54 | -------------------------------------------------------------------------------- /pkg/cluster/metadata/metadata.go: -------------------------------------------------------------------------------- 1 | package metadata 2 | 3 | import ( 4 | "fmt" 5 | 6 | "sigs.k8s.io/controller-runtime/pkg/client" 7 | 8 | "github.com/dominodatalab/distributed-compute-operator/pkg/util" 9 | ) 10 | 11 | const ( 12 | // ApplicationNameLabelKey indicates the name of the application. 13 | ApplicationNameLabelKey = "app.kubernetes.io/name" 14 | // ApplicationInstanceLabelKey indicates a unique name identifying the instance of an application. 15 | ApplicationInstanceLabelKey = "app.kubernetes.io/instance" 16 | // ApplicationVersionLabelKey indicates the current version of the application. 17 | ApplicationVersionLabelKey = "app.kubernetes.io/version" 18 | // ApplicationComponentLabelKey indicates the component within the architecture of an application. 19 | ApplicationComponentLabelKey = "app.kubernetes.io/component" 20 | // ApplicationManagedByLabelKey indicates the tool being used to manage the operation of an application. 21 | ApplicationManagedByLabelKey = "app.kubernetes.io/managed-by" 22 | // ApplicationManagedByLabelValue is the specific tool being used to manage applications created by this project. 23 | ApplicationManagedByLabelValue = "distributed-compute-operator" 24 | // DescriptionAnnotationKey can be used to add extra information to a Kubernetes object via its annotations. 25 | DescriptionAnnotationKey = "distributed-compute.dominodatalab.com/description" 26 | ) 27 | 28 | // Component is used to drive Kubernetes object generation for different types. 29 | type Component string 30 | 31 | // ComponentNone indicates a generic resource. 
32 | const ComponentNone Component = "none" 33 | 34 | type versionExtractor func(client.Object) string 35 | type globalLabelsFn func(client.Object) map[string]string 36 | 37 | type Provider struct { 38 | application string 39 | version versionExtractor 40 | globalLabels globalLabelsFn 41 | } 42 | 43 | func NewProvider(name string, version versionExtractor, globalLabels globalLabelsFn) *Provider { 44 | return &Provider{ 45 | application: name, 46 | version: version, 47 | globalLabels: globalLabels, 48 | } 49 | } 50 | 51 | func (p *Provider) InstanceName(obj client.Object, comp Component) string { 52 | if comp == ComponentNone { 53 | return fmt.Sprintf("%s-%s", obj.GetName(), p.application) 54 | } 55 | 56 | return fmt.Sprintf("%s-%s-%s", obj.GetName(), p.application, comp) 57 | } 58 | 59 | func (p *Provider) StandardLabels(obj client.Object) map[string]string { 60 | labels := map[string]string{ 61 | ApplicationNameLabelKey: p.application, 62 | ApplicationInstanceLabelKey: obj.GetName(), 63 | ApplicationVersionLabelKey: p.version(obj), 64 | ApplicationManagedByLabelKey: ApplicationManagedByLabelValue, 65 | } 66 | 67 | return util.MergeStringMaps(p.globalLabels(obj), labels) 68 | } 69 | 70 | func (p *Provider) StandardLabelsWithComponent(obj client.Object, comp Component, extraLabels map[string]string) map[string]string { 71 | labels := p.StandardLabels(obj) 72 | labels[ApplicationComponentLabelKey] = string(comp) 73 | 74 | if extraLabels != nil { 75 | labels = util.MergeStringMaps(extraLabels, labels) 76 | } 77 | 78 | return labels 79 | } 80 | 81 | func (p *Provider) MatchLabels(obj client.Object) map[string]string { 82 | return map[string]string{ 83 | ApplicationNameLabelKey: p.application, 84 | ApplicationInstanceLabelKey: obj.GetName(), 85 | } 86 | } 87 | 88 | func (p *Provider) MatchLabelsWithComponent(obj client.Object, comp Component) map[string]string { 89 | labels := p.MatchLabels(obj) 90 | labels[ApplicationComponentLabelKey] = string(comp) 91 | 92 | return labels 93 | } 94 | -------------------------------------------------------------------------------- /pkg/cluster/mpi/clientports.go: -------------------------------------------------------------------------------- 1 | package mpi 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | "sigs.k8s.io/controller-runtime/pkg/client" 6 | 7 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/components" 8 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 9 | ) 10 | 11 | func ClientPortsService() core.OwnedComponent { 12 | return components.ClientPortsServiceComponent{ 13 | ClientPorts: func(obj *client.Object) []corev1.ServicePort { 14 | return objToMPICluster(*obj).Spec.AdditionalClientPorts 15 | }, 16 | ClientLabels: func(obj *client.Object) map[string]string { 17 | return objToMPICluster(*obj).Spec.NetworkPolicy.ClientLabels 18 | }, 19 | Meta: meta, 20 | } 21 | } 22 | 23 | func ClientPortsNetworkPolicy() core.OwnedComponent { 24 | return components.ClientPortsNetworkPolicyComponent{ 25 | ClientPorts: func(obj *client.Object) []corev1.ServicePort { 26 | return objToMPICluster(*obj).Spec.AdditionalClientPorts 27 | }, 28 | ClientLabels: func(obj *client.Object) map[string]string { 29 | return objToMPICluster(*obj).Spec.NetworkPolicy.ClientLabels 30 | }, 31 | Meta: meta, 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /pkg/cluster/mpi/configmap.go: -------------------------------------------------------------------------------- 1 | package mpi 
2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | 7 | corev1 "k8s.io/api/core/v1" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | ctrl "sigs.k8s.io/controller-runtime" 10 | "sigs.k8s.io/controller-runtime/pkg/client" 11 | 12 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 13 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/actions" 14 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 15 | ) 16 | 17 | func ConfigMap() core.OwnedComponent { 18 | return &configMapComponent{} 19 | } 20 | 21 | type configMapComponent struct{} 22 | 23 | func (c configMapComponent) Reconcile(ctx *core.Context) (ctrl.Result, error) { 24 | cr := objToMPICluster(ctx.Object) 25 | 26 | hostFileConfig := createHostFileConfig(cr) 27 | err := actions.CreateOrUpdateOwnedResource(ctx, cr, hostFileConfig) 28 | if err != nil { 29 | return ctrl.Result{}, fmt.Errorf("cannot reconcile hostfile configmap: %w", err) 30 | } 31 | 32 | keytabConfig := createKeytabConfig(cr) 33 | if keytabConfig != nil { 34 | err := actions.CreateOrUpdateOwnedResource(ctx, cr, keytabConfig) 35 | if err != nil { 36 | return ctrl.Result{}, fmt.Errorf("cannot reconcile keytab configmap: %w", err) 37 | } 38 | } 39 | 40 | return ctrl.Result{}, nil 41 | } 42 | 43 | func (c configMapComponent) Kind() client.Object { 44 | return &corev1.ConfigMap{} 45 | } 46 | 47 | func createHostFileConfig(cr *dcv1alpha1.MPICluster) *corev1.ConfigMap { 48 | svcName := serviceName(cr, ComponentWorker) 49 | workerName := workerStatefulSetName(cr) 50 | workerReplicas := *cr.Spec.Worker.Replicas 51 | 52 | var hostFileBuilder strings.Builder 53 | for idx := 0; idx < int(workerReplicas); idx++ { 54 | entry := fmt.Sprintf("%s-%d.%s\n", workerName, idx, svcName) 55 | hostFileBuilder.WriteString(entry) 56 | } 57 | 58 | return &corev1.ConfigMap{ 59 | ObjectMeta: metav1.ObjectMeta{ 60 | Name: configMapName(cr) + "-" + hostFileName, 61 | Namespace: cr.Namespace, 62 | Labels: meta.StandardLabels(cr), 63 | }, 64 | Data: map[string]string{ 65 | hostFileName: hostFileBuilder.String(), 66 | }, 67 | } 68 | } 69 | 70 | func createKeytabConfig(cr *dcv1alpha1.MPICluster) *corev1.ConfigMap { 71 | if cr.Spec.KerberosKeytab == nil { 72 | return nil 73 | } 74 | return &corev1.ConfigMap{ 75 | ObjectMeta: metav1.ObjectMeta{ 76 | Name: configMapName(cr) + "-" + keytabName, 77 | Namespace: cr.Namespace, 78 | Labels: meta.StandardLabels(cr), 79 | }, 80 | BinaryData: map[string][]byte{ 81 | keytabName: cr.Spec.KerberosKeytab.Contents, 82 | }, 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /pkg/cluster/mpi/istiopeerauthentication.go: -------------------------------------------------------------------------------- 1 | package mpi 2 | 3 | import ( 4 | "sigs.k8s.io/controller-runtime/pkg/client" 5 | 6 | authenticationv1alpha1 "istio.io/api/authentication/v1alpha1" 7 | 8 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 9 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/metadata" 10 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/components" 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 12 | "github.com/dominodatalab/distributed-compute-operator/pkg/resources/istio" 13 | ) 14 | 15 | func IstioPeerAuthentication(enabled bool) core.Component { 16 | return components.IstioPeerAuthentication(func(obj client.Object) components.IstioPeerAuthenticationDataSource { 17 | 
return &istioPeerAuthenticationDS{mpi: objToMPICluster(obj), enabled: enabled} 18 | }) 19 | } 20 | 21 | type istioPeerAuthenticationDS struct { 22 | mpi *dcv1alpha1.MPICluster 23 | enabled bool 24 | } 25 | 26 | func (s *istioPeerAuthenticationDS) PeerAuthInfo() *istio.PeerAuthInfo { 27 | return &istio.PeerAuthInfo{ 28 | Name: meta.InstanceName(s.mpi, metadata.ComponentNone), 29 | Namespace: s.mpi.Namespace, 30 | Labels: meta.StandardLabels(s.mpi), 31 | Selector: meta.MatchLabels(s.mpi), 32 | Mode: s.mpi.Spec.MutualTLSMode, 33 | } 34 | } 35 | 36 | func (s *istioPeerAuthenticationDS) Enabled() bool { 37 | return s.enabled 38 | } 39 | 40 | func (s *istioPeerAuthenticationDS) Delete() bool { 41 | return s.mpi.Spec.MutualTLSMode == "" 42 | } 43 | 44 | func IstioClientPeerAuthentication(enabled bool) core.Component { 45 | return components.IstioPeerAuthentication(func(obj client.Object) components.IstioPeerAuthenticationDataSource { 46 | return &istioClientPeerAuthenticationDS{mpi: objToMPICluster(obj), enabled: enabled} 47 | }) 48 | } 49 | 50 | type istioClientPeerAuthenticationDS struct { 51 | mpi *dcv1alpha1.MPICluster 52 | enabled bool 53 | } 54 | 55 | func (s *istioClientPeerAuthenticationDS) PeerAuthInfo() *istio.PeerAuthInfo { 56 | return &istio.PeerAuthInfo{ 57 | Name: meta.InstanceName(s.mpi, ComponentClient), 58 | Namespace: s.mpi.Namespace, 59 | Labels: meta.StandardLabels(s.mpi), 60 | Selector: s.mpi.Spec.NetworkPolicy.ClientLabels, 61 | Mode: authenticationv1alpha1.MutualTls_PERMISSIVE.String(), 62 | } 63 | } 64 | 65 | func (s *istioClientPeerAuthenticationDS) Enabled() bool { 66 | return s.enabled && s.mpi.Spec.Worker.Annotations["sidecar.istio.io/inject"] == "false" 67 | } 68 | 69 | func (s *istioClientPeerAuthenticationDS) Delete() bool { 70 | return false 71 | } 72 | -------------------------------------------------------------------------------- /pkg/cluster/mpi/metadata.go: -------------------------------------------------------------------------------- 1 | package mpi 2 | 3 | import ( 4 | "sigs.k8s.io/controller-runtime/pkg/client" 5 | 6 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 7 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/metadata" 8 | ) 9 | 10 | const ( 11 | ApplicationName = "mpi" 12 | ComponentWorker metadata.Component = "worker" 13 | ComponentClient metadata.Component = "client" 14 | ) 15 | 16 | var meta = metadata.NewProvider( 17 | ApplicationName, 18 | func(obj client.Object) string { return objToMPICluster(obj).Spec.Image.Tag }, 19 | func(obj client.Object) map[string]string { return objToMPICluster(obj).Spec.GlobalLabels }, 20 | ) 21 | 22 | func objToMPICluster(obj client.Object) *dcv1alpha1.MPICluster { 23 | return obj.(*dcv1alpha1.MPICluster) 24 | } 25 | -------------------------------------------------------------------------------- /pkg/cluster/mpi/mpi.go: -------------------------------------------------------------------------------- 1 | package mpi 2 | 3 | import ( 4 | "time" 5 | 6 | "sigs.k8s.io/controller-runtime/pkg/client" 7 | 8 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 9 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/metadata" 10 | ) 11 | 12 | const ( 13 | // SSH port used by MPI worker 14 | sshdPort = 2222 15 | sshdPortName = "tcp-ssh" 16 | 17 | // Locations of the mounted files and their modes 18 | authorizedKeysPath = "/etc/mpi/authorized_keys" 19 | authorizedKeysMode = 0444 // octal! 
20 | 21 | // Location of common Domino utilities 22 | customUtilPath = "/opt/domino/mpi-cluster" 23 | 24 | // Default parameters of a user account for executing MPI workload. 25 | defaultUserID = 12574 26 | defaultUserName = "domino" 27 | defaultGroupID = 12574 28 | defaultGroupName = "domino" 29 | defaultHomeDir = "/mnt" 30 | 31 | // SSH ports used by rsync sidecar 32 | rsyncPort = 2223 33 | rsyncPortName = "tcp-rsync" 34 | 35 | // User and group for running the sidecar container; 36 | // they should match a user provisioned in the sidecar image. 37 | rsyncUserID = 12574 38 | rsyncGroupID = 12574 39 | 40 | // Name of an MPI hostfile; also a key in the config map and its prefix 41 | hostFileName = "hostfile" 42 | 43 | // Name of a Kerberos keytab file; also a key in the config map and its prefix 44 | keytabName = "keytab" 45 | 46 | // Period of rerunning resource finalizers 47 | finalizerRetryPeriod = 1 * time.Second 48 | ) 49 | 50 | func configMapName(cr client.Object) string { 51 | return meta.InstanceName(cr, "config") 52 | } 53 | 54 | func selectServiceAccount(cr *dcv1alpha1.MPICluster) string { 55 | if cr.Spec.ServiceAccount.Name != "" { 56 | return cr.Spec.ServiceAccount.Name 57 | } 58 | 59 | return serviceAccountName(cr) 60 | } 61 | 62 | func serviceAccountName(cr client.Object) string { 63 | return meta.InstanceName(cr, metadata.ComponentNone) 64 | } 65 | 66 | func serviceName(cr client.Object, comp metadata.Component) string { 67 | return meta.InstanceName(cr, comp) 68 | } 69 | 70 | func sshSecretName(cr *dcv1alpha1.MPICluster) string { 71 | worker := cr.Spec.Worker 72 | return worker.SharedSSHSecret 73 | } 74 | 75 | func workerStatefulSetName(cr client.Object) string { 76 | return meta.InstanceName(cr, ComponentWorker) 77 | } 78 | -------------------------------------------------------------------------------- /pkg/cluster/mpi/networkpolicy.go: -------------------------------------------------------------------------------- 1 | package mpi 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | 7 | networkingv1 "k8s.io/api/networking/v1" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | ctrl "sigs.k8s.io/controller-runtime" 10 | "sigs.k8s.io/controller-runtime/pkg/client" 11 | 12 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/metadata" 13 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/actions" 14 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 15 | ) 16 | 17 | func NetworkPolicyWorker() core.OwnedComponent { 18 | return &networkPolicyComponent{ 19 | comp: ComponentWorker, 20 | } 21 | } 22 | 23 | func NetworkPolicyClient() core.OwnedComponent { 24 | return &networkPolicyComponent{ 25 | comp: ComponentClient, 26 | } 27 | } 28 | 29 | type networkPolicyComponent struct { 30 | comp metadata.Component 31 | } 32 | 33 | func (c networkPolicyComponent) Reconcile(ctx *core.Context) (ctrl.Result, error) { 34 | cr := objToMPICluster(ctx.Object) 35 | 36 | matchLabels := meta.MatchLabels(cr) 37 | 38 | var podSelectorMatchLabels map[string]string 39 | var ingressRules []networkingv1.NetworkPolicyPeer 40 | 41 | switch c.comp { 42 | case ComponentWorker: 43 | podSelectorMatchLabels = matchLabels 44 | ingressRules = []networkingv1.NetworkPolicyPeer{ 45 | { 46 | PodSelector: &metav1.LabelSelector{ 47 | MatchLabels: matchLabels, 48 | }, 49 | }, 50 | { 51 | PodSelector: &metav1.LabelSelector{ 52 | MatchLabels: cr.Spec.NetworkPolicy.ClientLabels, 53 | }, 54 | }, 55 | } 56 | case ComponentClient: 57 | podSelectorMatchLabels = 
cr.Spec.NetworkPolicy.ClientLabels 58 | ingressRules = []networkingv1.NetworkPolicyPeer{ 59 | { 60 | PodSelector: &metav1.LabelSelector{ 61 | MatchLabels: matchLabels, 62 | }, 63 | }, 64 | } 65 | case metadata.ComponentNone: 66 | err := errors.New("unknown component for NetworkPolicy") 67 | return ctrl.Result{}, err 68 | } 69 | 70 | netpol := &networkingv1.NetworkPolicy{ 71 | ObjectMeta: metav1.ObjectMeta{ 72 | Name: meta.InstanceName(cr, c.comp), 73 | Namespace: cr.Namespace, 74 | Labels: meta.StandardLabels(cr), 75 | }, 76 | Spec: networkingv1.NetworkPolicySpec{ 77 | PodSelector: metav1.LabelSelector{ 78 | MatchLabels: podSelectorMatchLabels, 79 | }, 80 | Ingress: []networkingv1.NetworkPolicyIngressRule{ 81 | { 82 | From: ingressRules, 83 | }, 84 | }, 85 | PolicyTypes: []networkingv1.PolicyType{ 86 | networkingv1.PolicyTypeIngress, 87 | }, 88 | }, 89 | } 90 | 91 | err := actions.CreateOrUpdateOwnedResource(ctx, cr, netpol) 92 | if err != nil { 93 | err = fmt.Errorf("cannot reconcile networkpolicy: %w", err) 94 | } 95 | 96 | return ctrl.Result{}, err 97 | } 98 | 99 | func (c networkPolicyComponent) Kind() client.Object { 100 | return &networkingv1.NetworkPolicy{} 101 | } 102 | -------------------------------------------------------------------------------- /pkg/cluster/mpi/podsecuritypolicy.go: -------------------------------------------------------------------------------- 1 | package mpi 2 | 3 | import ( 4 | "fmt" 5 | 6 | rbacv1 "k8s.io/api/rbac/v1" 7 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | ctrl "sigs.k8s.io/controller-runtime" 9 | "sigs.k8s.io/controller-runtime/pkg/client" 10 | 11 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 12 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/metadata" 13 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/actions" 14 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 15 | ) 16 | 17 | var ( 18 | policyAPIGroups = []string{"policy"} 19 | podSecurityPolicyResources = []string{"podsecuritypolicies"} 20 | useVerbs = []string{"use"} 21 | ) 22 | 23 | func RolePodSecurityPolicy() core.OwnedComponent { 24 | return &podSecurityPolicyComponent{kind: &rbacv1.Role{}} 25 | } 26 | 27 | func RoleBindingPodSecurityPolicy() core.OwnedComponent { 28 | return &podSecurityPolicyComponent{kind: &rbacv1.RoleBinding{}} 29 | } 30 | 31 | type podSecurityPolicyComponent struct { 32 | kind client.Object 33 | } 34 | 35 | func (c podSecurityPolicyComponent) Reconcile(ctx *core.Context) (ctrl.Result, error) { 36 | cr := objToMPICluster(ctx.Object) 37 | 38 | desc, resource := c.buildResource(cr) 39 | 40 | if cr.Spec.PodSecurityPolicy == "" { 41 | return ctrl.Result{}, actions.DeleteIfExists(ctx, resource) 42 | } 43 | 44 | err := actions.CreateOrUpdateOwnedResource(ctx, ctx.Object, resource) 45 | if err != nil { 46 | err = fmt.Errorf("cannot reconcile %s: %w", desc, err) 47 | } 48 | 49 | return ctrl.Result{}, err 50 | } 51 | 52 | func (c podSecurityPolicyComponent) Kind() client.Object { 53 | return c.kind 54 | } 55 | 56 | func (c podSecurityPolicyComponent) buildResource(cr *dcv1alpha1.MPICluster) (string, client.Object) { 57 | om := metav1.ObjectMeta{ 58 | Name: meta.InstanceName(cr, metadata.ComponentNone), 59 | Namespace: cr.Namespace, 60 | Labels: meta.StandardLabels(cr), 61 | } 62 | 63 | switch c.Kind().(type) { 64 | case *rbacv1.Role: 65 | return "role", &rbacv1.Role{ 66 | ObjectMeta: om, 67 | Rules: []rbacv1.PolicyRule{ 68 | { 69 | APIGroups: policyAPIGroups, 
70 | Resources: podSecurityPolicyResources, 71 | Verbs: useVerbs, 72 | ResourceNames: []string{cr.Spec.PodSecurityPolicy}, 73 | }, 74 | }, 75 | } 76 | case *rbacv1.RoleBinding: 77 | return "role binding", &rbacv1.RoleBinding{ 78 | ObjectMeta: om, 79 | RoleRef: rbacv1.RoleRef{ 80 | APIGroup: rbacv1.GroupName, 81 | Kind: "Role", 82 | Name: om.Name, 83 | }, 84 | Subjects: []rbacv1.Subject{ 85 | { 86 | Kind: rbacv1.ServiceAccountKind, 87 | Name: serviceAccountName(cr), 88 | Namespace: cr.Namespace, 89 | }, 90 | }, 91 | } 92 | } 93 | 94 | panic("unsupported kind for pod security policy component") 95 | } 96 | -------------------------------------------------------------------------------- /pkg/cluster/mpi/service.go: -------------------------------------------------------------------------------- 1 | package mpi 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | 7 | corev1 "k8s.io/api/core/v1" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | "k8s.io/apimachinery/pkg/util/intstr" 10 | ctrl "sigs.k8s.io/controller-runtime" 11 | "sigs.k8s.io/controller-runtime/pkg/client" 12 | 13 | "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 14 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/metadata" 15 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/actions" 16 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 17 | ) 18 | 19 | func ServiceWorker() core.OwnedComponent { 20 | return &serviceComponent{ 21 | comp: ComponentWorker, 22 | } 23 | } 24 | 25 | func ServiceClient() core.OwnedComponent { 26 | return &serviceComponent{ 27 | comp: ComponentClient, 28 | } 29 | } 30 | 31 | type serviceComponent struct { 32 | comp metadata.Component 33 | } 34 | 35 | func (c serviceComponent) Reconcile(ctx *core.Context) (ctrl.Result, error) { 36 | cr := objToMPICluster(ctx.Object) 37 | 38 | ports := []corev1.ServicePort{} 39 | var selector map[string]string 40 | var extraLabels map[string]string 41 | switch c.comp { 42 | case ComponentClient: 43 | selector = cr.Spec.NetworkPolicy.ClientLabels 44 | extraLabels = map[string]string{} 45 | case ComponentWorker: 46 | ports = append(ports, 47 | corev1.ServicePort{ 48 | Name: sshdPortName, 49 | Port: sshdPort, 50 | TargetPort: intstr.FromString(sshdPortName), 51 | Protocol: corev1.ProtocolTCP, 52 | }, 53 | corev1.ServicePort{ 54 | Name: rsyncPortName, 55 | Port: rsyncPort, 56 | TargetPort: intstr.FromString(rsyncPortName), 57 | Protocol: corev1.ProtocolTCP, 58 | }) 59 | 60 | selector = meta.MatchLabelsWithComponent(cr, c.comp) 61 | extraLabels = cr.Spec.Worker.Labels 62 | case metadata.ComponentNone: 63 | err := errors.New("unknown component for Service") 64 | return ctrl.Result{}, err 65 | } 66 | 67 | ports = append(ports, mpiPorts(cr)...)
68 | 69 | svc := &corev1.Service{ 70 | ObjectMeta: metav1.ObjectMeta{ 71 | Name: serviceName(cr, c.comp), 72 | Namespace: cr.Namespace, 73 | Labels: meta.StandardLabelsWithComponent(cr, c.comp, extraLabels), 74 | }, 75 | Spec: corev1.ServiceSpec{ 76 | ClusterIP: corev1.ClusterIPNone, 77 | Selector: selector, 78 | Ports: ports, 79 | }, 80 | } 81 | 82 | err := actions.CreateOrUpdateOwnedResource(ctx, cr, svc) 83 | if err != nil { 84 | err = fmt.Errorf("cannot reconcile service: %w", err) 85 | } 86 | 87 | return ctrl.Result{}, err 88 | } 89 | 90 | func (c serviceComponent) Kind() client.Object { 91 | return &corev1.Service{} 92 | } 93 | 94 | func mpiPorts(cr *v1alpha1.MPICluster) []corev1.ServicePort { 95 | ports := []corev1.ServicePort{} 96 | for idx, port := range cr.Spec.WorkerPorts { 97 | ports = append(ports, corev1.ServicePort{ 98 | Name: fmt.Sprintf("tcp-mpi-%d", idx), 99 | Port: port, 100 | TargetPort: intstr.FromInt(int(port)), 101 | }) 102 | } 103 | return ports 104 | } 105 | -------------------------------------------------------------------------------- /pkg/cluster/mpi/serviceaccount.go: -------------------------------------------------------------------------------- 1 | package mpi 2 | 3 | import ( 4 | "fmt" 5 | 6 | corev1 "k8s.io/api/core/v1" 7 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | "k8s.io/utils/pointer" 9 | ctrl "sigs.k8s.io/controller-runtime" 10 | "sigs.k8s.io/controller-runtime/pkg/client" 11 | 12 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/actions" 13 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 14 | ) 15 | 16 | func ServiceAccount() core.OwnedComponent { 17 | return &serviceAccountComponent{} 18 | } 19 | 20 | type serviceAccountComponent struct{} 21 | 22 | func (c serviceAccountComponent) Reconcile(ctx *core.Context) (ctrl.Result, error) { 23 | cr := objToMPICluster(ctx.Object) 24 | conf := cr.Spec.ServiceAccount 25 | 26 | sa := &corev1.ServiceAccount{ 27 | ObjectMeta: metav1.ObjectMeta{ 28 | Name: serviceAccountName(cr), 29 | Namespace: cr.Namespace, 30 | Labels: meta.StandardLabels(cr), 31 | }, 32 | AutomountServiceAccountToken: pointer.Bool(conf.AutomountServiceAccountToken), 33 | } 34 | 35 | if conf.Name != "" { 36 | return ctrl.Result{}, actions.DeleteIfExists(ctx, sa) 37 | } 38 | 39 | err := actions.CreateOrUpdateOwnedResource(ctx, cr, sa) 40 | if err != nil { 41 | err = fmt.Errorf("cannot reconcile serviceaccount: %w", err) 42 | } 43 | 44 | return ctrl.Result{}, err 45 | } 46 | 47 | func (c serviceAccountComponent) Kind() client.Object { 48 | return &corev1.ServiceAccount{} 49 | } 50 | -------------------------------------------------------------------------------- /pkg/controller/components/configmap.go: -------------------------------------------------------------------------------- 1 | package components 2 | 3 | import ( 4 | "fmt" 5 | 6 | corev1 "k8s.io/api/core/v1" 7 | ctrl "sigs.k8s.io/controller-runtime" 8 | "sigs.k8s.io/controller-runtime/pkg/client" 9 | 10 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/actions" 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 12 | ) 13 | 14 | type ConfigMapDataSource interface { 15 | ConfigMap() *corev1.ConfigMap 16 | Delete() bool 17 | } 18 | 19 | type ConfigMapDataSourceFactory func(object client.Object) ConfigMapDataSource 20 | 21 | func ConfigMap(f ConfigMapDataSourceFactory) core.OwnedComponent { 22 | return &configMapComponent{factory: f} 23 | } 24 | 25 | type configMapComponent struct { 
26 | factory ConfigMapDataSourceFactory 27 | } 28 | 29 | func (c *configMapComponent) Kind() client.Object { 30 | return &corev1.ConfigMap{} 31 | } 32 | 33 | func (c *configMapComponent) Reconcile(ctx *core.Context) (ctrl.Result, error) { 34 | ds := c.factory(ctx.Object) 35 | cm := ds.ConfigMap() 36 | 37 | if ds.Delete() { 38 | return ctrl.Result{}, actions.DeleteIfExists(ctx, cm) 39 | } 40 | 41 | err := actions.CreateOrUpdateOwnedResource(ctx, ctx.Object, cm) 42 | if err != nil { 43 | err = fmt.Errorf("cannot reconcile config map: %w", err) 44 | } 45 | 46 | return ctrl.Result{}, err 47 | } 48 | -------------------------------------------------------------------------------- /pkg/controller/components/horizontalpodautoscaler.go: -------------------------------------------------------------------------------- 1 | //nolint:dupl 2 | package components 3 | 4 | import ( 5 | "fmt" 6 | 7 | autoscalingv2 "k8s.io/api/autoscaling/v2" 8 | ctrl "sigs.k8s.io/controller-runtime" 9 | "sigs.k8s.io/controller-runtime/pkg/client" 10 | 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/actions" 12 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 13 | ) 14 | 15 | type HorizontalPodAutoscalerDataSource interface { 16 | HorizontalPodAutoscaler() *autoscalingv2.HorizontalPodAutoscaler 17 | Delete() bool 18 | } 19 | 20 | type HorizontalPodAutoscalerDataSourceFactory func(client.Object) HorizontalPodAutoscalerDataSource 21 | 22 | func HorizontalPodAutoscaler(f HorizontalPodAutoscalerDataSourceFactory) core.OwnedComponent { 23 | return &horizontalPodAutoscaler{factory: f} 24 | } 25 | 26 | type horizontalPodAutoscaler struct { 27 | factory HorizontalPodAutoscalerDataSourceFactory 28 | } 29 | 30 | func (c *horizontalPodAutoscaler) Kind() client.Object { 31 | return &autoscalingv2.HorizontalPodAutoscaler{} 32 | } 33 | 34 | func (c *horizontalPodAutoscaler) Reconcile(ctx *core.Context) (ctrl.Result, error) { 35 | ds := c.factory(ctx.Object) 36 | hpa := ds.HorizontalPodAutoscaler() 37 | 38 | if ds.Delete() { 39 | return ctrl.Result{}, actions.DeleteIfExists(ctx, hpa) 40 | } 41 | 42 | err := actions.CreateOrUpdateOwnedResource(ctx, ctx.Object, hpa) 43 | if err != nil { 44 | err = fmt.Errorf("cannot reconcile horizontal pod autoscaler: %w", err) 45 | } 46 | 47 | return ctrl.Result{}, err 48 | } 49 | -------------------------------------------------------------------------------- /pkg/controller/components/istiopeerauthentication.go: -------------------------------------------------------------------------------- 1 | package components 2 | 3 | import ( 4 | "fmt" 5 | 6 | ctrl "sigs.k8s.io/controller-runtime" 7 | "sigs.k8s.io/controller-runtime/pkg/client" 8 | 9 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/actions" 10 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/resources/istio" 12 | ) 13 | 14 | type IstioPeerAuthenticationDataSource interface { 15 | PeerAuthInfo() *istio.PeerAuthInfo 16 | Enabled() bool 17 | Delete() bool 18 | } 19 | 20 | type IstioPeerAuthenticationDataSourceFactory func(client.Object) IstioPeerAuthenticationDataSource 21 | 22 | func IstioPeerAuthentication(f IstioPeerAuthenticationDataSourceFactory) core.Component { 23 | return &istioPeerAuthenticationComponent{factory: f} 24 | } 25 | 26 | type istioPeerAuthenticationComponent struct { 27 | factory IstioPeerAuthenticationDataSourceFactory 28 | } 29 | 30 | func (c 
*istioPeerAuthenticationComponent) Reconcile(ctx *core.Context) (ctrl.Result, error) { 31 | ds := c.factory(ctx.Object) 32 | 33 | if !ds.Enabled() { 34 | return ctrl.Result{}, nil 35 | } 36 | 37 | peerAuth := istio.NewPeerAuthentication(ds.PeerAuthInfo()) 38 | if ds.Delete() { 39 | return ctrl.Result{}, actions.DeleteIfExists(ctx, peerAuth) 40 | } 41 | 42 | err := actions.CreateOrUpdateOwnedResource(ctx, ctx.Object, peerAuth) 43 | if err != nil { 44 | err = fmt.Errorf("cannot reconcile istio peer authentication: %w", err) 45 | } 46 | 47 | return ctrl.Result{}, err 48 | } 49 | -------------------------------------------------------------------------------- /pkg/controller/components/networkpolicy.go: -------------------------------------------------------------------------------- 1 | //nolint:dupl 2 | package components 3 | 4 | import ( 5 | "fmt" 6 | 7 | networkingv1 "k8s.io/api/networking/v1" 8 | ctrl "sigs.k8s.io/controller-runtime" 9 | "sigs.k8s.io/controller-runtime/pkg/client" 10 | 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/actions" 12 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 13 | ) 14 | 15 | type NetworkPolicyDataSource interface { 16 | NetworkPolicy() *networkingv1.NetworkPolicy 17 | Delete() bool 18 | } 19 | 20 | type NetworkPolicyDataSourceFactory func(client.Object) NetworkPolicyDataSource 21 | 22 | func NetworkPolicy(f NetworkPolicyDataSourceFactory) core.OwnedComponent { 23 | return &networkPolicyComponent{factory: f} 24 | } 25 | 26 | type networkPolicyComponent struct { 27 | factory NetworkPolicyDataSourceFactory 28 | } 29 | 30 | func (c *networkPolicyComponent) Kind() client.Object { 31 | return &networkingv1.NetworkPolicy{} 32 | } 33 | 34 | func (c *networkPolicyComponent) Reconcile(ctx *core.Context) (ctrl.Result, error) { 35 | ds := c.factory(ctx.Object) 36 | netpol := ds.NetworkPolicy() 37 | 38 | if ds.Delete() { 39 | return ctrl.Result{}, actions.DeleteIfExists(ctx, netpol) 40 | } 41 | 42 | err := actions.CreateOrUpdateOwnedResource(ctx, ctx.Object, netpol) 43 | if err != nil { 44 | err = fmt.Errorf("cannot reconcile network policy: %w", err) 45 | } 46 | 47 | return ctrl.Result{}, err 48 | } 49 | -------------------------------------------------------------------------------- /pkg/controller/components/rbac.go: -------------------------------------------------------------------------------- 1 | package components 2 | 3 | import ( 4 | "fmt" 5 | 6 | rbacv1 "k8s.io/api/rbac/v1" 7 | ctrl "sigs.k8s.io/controller-runtime" 8 | "sigs.k8s.io/controller-runtime/pkg/client" 9 | 10 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/actions" 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 12 | ) 13 | 14 | type RoleDataSource interface { 15 | Role() *rbacv1.Role 16 | Delete() bool 17 | } 18 | 19 | type RoleDataSourceFactory func(client.Object) RoleDataSource 20 | 21 | func Role(f RoleDataSourceFactory) core.OwnedComponent { 22 | return &roleComponent{factory: f} 23 | } 24 | 25 | type roleComponent struct { 26 | factory RoleDataSourceFactory 27 | } 28 | 29 | func (c *roleComponent) Kind() client.Object { 30 | return &rbacv1.Role{} 31 | } 32 | 33 | func (c *roleComponent) Reconcile(ctx *core.Context) (ctrl.Result, error) { 34 | ds := c.factory(ctx.Object) 35 | role := ds.Role() 36 | 37 | if ds.Delete() { 38 | return ctrl.Result{}, actions.DeleteIfExists(ctx, role) 39 | } 40 | 41 | err := actions.CreateOrUpdateOwnedResource(ctx, ctx.Object, role) 42 | if err != nil { 
43 | err = fmt.Errorf("cannot reconcile role: %w", err) 44 | } 45 | 46 | return ctrl.Result{}, err 47 | } 48 | 49 | type RoleBindingDataSource interface { 50 | RoleBinding() *rbacv1.RoleBinding 51 | Delete() bool 52 | } 53 | 54 | type RoleBindingDataSourceFactory func(client.Object) RoleBindingDataSource 55 | 56 | func RoleBinding(f RoleBindingDataSourceFactory) core.OwnedComponent { 57 | return &roleBindingComponent{factory: f} 58 | } 59 | 60 | type roleBindingComponent struct { 61 | factory RoleBindingDataSourceFactory 62 | } 63 | 64 | func (c *roleBindingComponent) Kind() client.Object { 65 | return &rbacv1.RoleBinding{} 66 | } 67 | 68 | func (c *roleBindingComponent) Reconcile(ctx *core.Context) (ctrl.Result, error) { 69 | ds := c.factory(ctx.Object) 70 | rb := ds.RoleBinding() 71 | 72 | if ds.Delete() { 73 | return ctrl.Result{}, actions.DeleteIfExists(ctx, rb) 74 | } 75 | 76 | err := actions.CreateOrUpdateOwnedResource(ctx, ctx.Object, rb) 77 | if err != nil { 78 | err = fmt.Errorf("cannot reconcile role binding: %w", err) 79 | } 80 | 81 | return ctrl.Result{}, err 82 | } 83 | -------------------------------------------------------------------------------- /pkg/controller/components/service.go: -------------------------------------------------------------------------------- 1 | package components 2 | 3 | import ( 4 | "fmt" 5 | 6 | corev1 "k8s.io/api/core/v1" 7 | ctrl "sigs.k8s.io/controller-runtime" 8 | "sigs.k8s.io/controller-runtime/pkg/client" 9 | 10 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/actions" 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 12 | ) 13 | 14 | type ServiceDataSource interface { 15 | Service() *corev1.Service 16 | } 17 | 18 | type ServiceDataSourceFactory func(client.Object) ServiceDataSource 19 | 20 | func Service(f ServiceDataSourceFactory) core.OwnedComponent { 21 | return &serviceComponent{factory: f} 22 | } 23 | 24 | type serviceComponent struct { 25 | factory ServiceDataSourceFactory 26 | } 27 | 28 | func (c *serviceComponent) Kind() client.Object { 29 | return &corev1.Service{} 30 | } 31 | 32 | func (c *serviceComponent) Reconcile(ctx *core.Context) (ctrl.Result, error) { 33 | ds := c.factory(ctx.Object) 34 | svc := ds.Service() 35 | 36 | err := actions.CreateOrUpdateOwnedResource(ctx, ctx.Object, svc) 37 | if err != nil { 38 | err = fmt.Errorf("cannot reconcile service: %w", err) 39 | } 40 | 41 | return ctrl.Result{}, err 42 | } 43 | -------------------------------------------------------------------------------- /pkg/controller/components/serviceaccount.go: -------------------------------------------------------------------------------- 1 | //nolint:dupl 2 | package components 3 | 4 | import ( 5 | "fmt" 6 | 7 | corev1 "k8s.io/api/core/v1" 8 | ctrl "sigs.k8s.io/controller-runtime" 9 | "sigs.k8s.io/controller-runtime/pkg/client" 10 | 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/actions" 12 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 13 | ) 14 | 15 | type ServiceAccountDataSource interface { 16 | ServiceAccount() *corev1.ServiceAccount 17 | Delete() bool 18 | } 19 | 20 | type ServiceAccountDataSourceFactory func(client.Object) ServiceAccountDataSource 21 | 22 | func ServiceAccount(f ServiceAccountDataSourceFactory) core.OwnedComponent { 23 | return &serviceAccountComponent{factory: f} 24 | } 25 | 26 | type serviceAccountComponent struct { 27 | factory ServiceAccountDataSourceFactory 28 | } 29 | 30 | func (c 
*serviceAccountComponent) Kind() client.Object { 31 | return &corev1.ServiceAccount{} 32 | } 33 | 34 | func (c *serviceAccountComponent) Reconcile(ctx *core.Context) (ctrl.Result, error) { 35 | ds := c.factory(ctx.Object) 36 | sa := ds.ServiceAccount() 37 | 38 | if ds.Delete() { 39 | return ctrl.Result{}, actions.DeleteIfExists(ctx, sa) 40 | } 41 | 42 | err := actions.CreateOrUpdateOwnedResource(ctx, ctx.Object, sa) 43 | if err != nil { 44 | err = fmt.Errorf("cannot reconcile service account: %w", err) 45 | } 46 | 47 | return ctrl.Result{}, err 48 | } 49 | -------------------------------------------------------------------------------- /pkg/controller/components/statefulset.go: -------------------------------------------------------------------------------- 1 | package components 2 | 3 | import ( 4 | "fmt" 5 | 6 | appsv1 "k8s.io/api/apps/v1" 7 | ctrl "sigs.k8s.io/controller-runtime" 8 | "sigs.k8s.io/controller-runtime/pkg/client" 9 | 10 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/actions" 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/controller/core" 12 | ) 13 | 14 | type StatefulSetDataSource interface { 15 | StatefulSet() (*appsv1.StatefulSet, error) 16 | PVCListOpts() []client.ListOption 17 | } 18 | 19 | type StatefulSetDataSourceFactory func(client.Object) StatefulSetDataSource 20 | 21 | func StatefulSet(f StatefulSetDataSourceFactory) core.OwnedComponent { 22 | return &statefulSetComponent{factory: f} 23 | } 24 | 25 | type statefulSetComponent struct { 26 | factory StatefulSetDataSourceFactory 27 | } 28 | 29 | func (c *statefulSetComponent) Kind() client.Object { 30 | return &appsv1.StatefulSet{} 31 | } 32 | 33 | func (c *statefulSetComponent) Reconcile(ctx *core.Context) (ctrl.Result, error) { 34 | ds := c.factory(ctx.Object) 35 | 36 | sts, err := ds.StatefulSet() 37 | if err != nil { 38 | return ctrl.Result{}, fmt.Errorf("failed to build statefulset: %w", err) 39 | } 40 | 41 | err = actions.CreateOrUpdateOwnedResource(ctx, ctx.Object, sts) 42 | if err != nil { 43 | err = fmt.Errorf("cannot reconcile stateful set: %w", err) 44 | } 45 | 46 | return ctrl.Result{}, err 47 | } 48 | 49 | func (c *statefulSetComponent) Finalize(ctx *core.Context) (ctrl.Result, bool, error) { 50 | ds := c.factory(ctx.Object) 51 | err := actions.DeleteStorage(ctx, ds.PVCListOpts()) 52 | 53 | return ctrl.Result{}, err == nil, err 54 | } 55 | -------------------------------------------------------------------------------- /pkg/controller/core/components.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | import ( 4 | ctrl "sigs.k8s.io/controller-runtime" 5 | "sigs.k8s.io/controller-runtime/pkg/client" 6 | ) 7 | 8 | type Component interface { 9 | Reconcile(*Context) (ctrl.Result, error) 10 | } 11 | 12 | type OwnedComponent interface { 13 | Component 14 | Kind() client.Object 15 | } 16 | 17 | type FinalizerComponent interface { 18 | Finalize(*Context) (ctrl.Result, bool, error) 19 | } 20 | -------------------------------------------------------------------------------- /pkg/controller/core/context.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/go-logr/logr" 7 | "k8s.io/apimachinery/pkg/runtime" 8 | "k8s.io/client-go/tools/record" 9 | "sigs.k8s.io/controller-runtime/pkg/client" 10 | ) 11 | 12 | type Context struct { 13 | context.Context 14 | 15 | Log logr.Logger 16 | Object client.Object 17 | Client 
client.Client 18 | Scheme *runtime.Scheme 19 | Recorder record.EventRecorder 20 | Patch *Patch 21 | } 22 | -------------------------------------------------------------------------------- /pkg/controller/core/patch.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | import ( 4 | "path" 5 | 6 | "github.com/banzaicloud/k8s-objectmatcher/patch" 7 | "k8s.io/apimachinery/pkg/runtime/schema" 8 | ) 9 | 10 | var defaultCalculateOpts = []patch.CalculateOption{ 11 | patch.IgnoreStatusFields(), 12 | patch.IgnoreVolumeClaimTemplateTypeMetaAndStatus(), 13 | } 14 | 15 | type Patch struct { 16 | Annotator *patch.Annotator 17 | Maker patch.Maker 18 | CalculateOpts []patch.CalculateOption 19 | } 20 | 21 | func NewPatch(gvk schema.GroupVersionKind) *Patch { 22 | a := patch.NewAnnotator(path.Join(gvk.Group, "last-applied")) 23 | m := patch.NewPatchMaker(a, &patch.K8sStrategicMergePatcher{}, &patch.BaseJSONMergePatcher{}) 24 | 25 | return &Patch{ 26 | Annotator: a, 27 | Maker: m, 28 | CalculateOpts: defaultCalculateOpts, 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /pkg/crd/istio.go: -------------------------------------------------------------------------------- 1 | package crd 2 | 3 | import ( 4 | "time" 5 | 6 | "github.com/hashicorp/go-retryablehttp" 7 | ) 8 | 9 | const ( 10 | checkURL = "http://localhost:15021/healthz/ready" 11 | finishURL = "http://localhost:15020/quitquitquit" 12 | ) 13 | 14 | var retryClient *retryablehttp.Client 15 | 16 | func waitForIstioSidecar() (func(), error) { 17 | log.Info("Checking istio sidecar") 18 | resp, err := retryClient.Head(checkURL) 19 | if err != nil { 20 | log.Error(err, "Istio sidecar is not ready") 21 | return nil, err 22 | } 23 | defer resp.Body.Close() 24 | 25 | log.Info("Istio sidecar available") 26 | fn := func() { 27 | log.Info("Triggering istio termination") 28 | _, _ = retryClient.Post(finishURL, "", nil) 29 | } 30 | 31 | return fn, err 32 | } 33 | 34 | func init() { 35 | retryClient = retryablehttp.NewClient() 36 | retryClient.RetryMax = 10 37 | retryClient.RetryWaitMin = 1 * time.Second 38 | retryClient.RetryWaitMax = 1 * time.Second 39 | } 40 | -------------------------------------------------------------------------------- /pkg/resources/istio/peerauthentication.go: -------------------------------------------------------------------------------- 1 | package istio 2 | 3 | import ( 4 | securityv1beta1 "istio.io/api/security/v1beta1" 5 | "istio.io/api/type/v1beta1" 6 | istio "istio.io/client-go/pkg/apis/security/v1beta1" 7 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | ) 9 | 10 | // PeerAuthInfo defines fields used to generate Istio PeerAuthentication objects. 11 | type PeerAuthInfo struct { 12 | Name string 13 | Namespace string 14 | Labels map[string]string 15 | Selector map[string]string 16 | Mode string 17 | } 18 | 19 | // NewPeerAuthentication uses PeerAuthInfo to generate and return a new PeerAuthentication object. 
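// An illustrative call (the values here are hypothetical, not taken from this
// repository):
//
//	pa := NewPeerAuthentication(&PeerAuthInfo{
//		Name:      "cluster",
//		Namespace: "ns",
//		Selector:  map[string]string{"app.kubernetes.io/name": "ray"},
//		Mode:      "STRICT",
//	})
//
// Mode strings are resolved through PeerAuthentication_MutualTLS_Mode_value, so an
// unrecognized value silently falls back to UNSET (the map's zero value); see
// TestNewPeerAuthentication below for the accepted modes.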
20 | func NewPeerAuthentication(info *PeerAuthInfo) *istio.PeerAuthentication { 21 | modeVal := securityv1beta1.PeerAuthentication_MutualTLS_Mode_value[info.Mode] 22 | 23 | return &istio.PeerAuthentication{ 24 | ObjectMeta: metav1.ObjectMeta{ 25 | Name: info.Name, 26 | Namespace: info.Namespace, 27 | Labels: info.Labels, 28 | }, 29 | Spec: securityv1beta1.PeerAuthentication{ 30 | Selector: &v1beta1.WorkloadSelector{ 31 | MatchLabels: info.Selector, 32 | }, 33 | Mtls: &securityv1beta1.PeerAuthentication_MutualTLS{ 34 | Mode: securityv1beta1.PeerAuthentication_MutualTLS_Mode(modeVal), 35 | }, 36 | }, 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /pkg/resources/istio/peerauthentication_test.go: -------------------------------------------------------------------------------- 1 | package istio 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | securityv1beta1 "istio.io/api/security/v1beta1" 8 | "istio.io/api/type/v1beta1" 9 | istio "istio.io/client-go/pkg/apis/security/v1beta1" 10 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 11 | ) 12 | 13 | func TestNewPeerAuthentication(t *testing.T) { 14 | testcases := []struct { 15 | smode string 16 | mode securityv1beta1.PeerAuthentication_MutualTLS_Mode 17 | }{ 18 | {"UNSET", securityv1beta1.PeerAuthentication_MutualTLS_UNSET}, 19 | {"DISABLE", securityv1beta1.PeerAuthentication_MutualTLS_DISABLE}, 20 | {"PERMISSIVE", securityv1beta1.PeerAuthentication_MutualTLS_PERMISSIVE}, 21 | {"STRICT", securityv1beta1.PeerAuthentication_MutualTLS_STRICT}, 22 | {"GARBAGE", securityv1beta1.PeerAuthentication_MutualTLS_UNSET}, 23 | } 24 | for _, tc := range testcases { 25 | info := &PeerAuthInfo{ 26 | Name: "cluster", 27 | Namespace: "ns", 28 | Labels: map[string]string{ 29 | "awesome": "true", 30 | }, 31 | Selector: map[string]string{ 32 | "app.kubernetes.io/name": "compute-r", 33 | }, 34 | Mode: tc.smode, 35 | } 36 | actual := NewPeerAuthentication(info) 37 | 38 | expected := &istio.PeerAuthentication{ 39 | ObjectMeta: metav1.ObjectMeta{ 40 | Name: "cluster", 41 | Namespace: "ns", 42 | Labels: map[string]string{ 43 | "awesome": "true", 44 | }, 45 | }, 46 | Spec: securityv1beta1.PeerAuthentication{ 47 | Selector: &v1beta1.WorkloadSelector{ 48 | MatchLabels: map[string]string{ 49 | "app.kubernetes.io/name": "compute-r", 50 | }, 51 | }, 52 | Mtls: &securityv1beta1.PeerAuthentication_MutualTLS{ 53 | Mode: tc.mode, 54 | }, 55 | }, 56 | } 57 | 58 | assert.Equal(t, expected, actual) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pkg/resources/metadata.go: -------------------------------------------------------------------------------- 1 | package resources 2 | 3 | const ( 4 | // ApplicationNameLabelKey indicates the name of the application. 5 | ApplicationNameLabelKey = "app.kubernetes.io/name" 6 | // ApplicationInstanceLabelKey indicates a unique name identifying the instance of an application. 7 | ApplicationInstanceLabelKey = "app.kubernetes.io/instance" 8 | // ApplicationVersionLabelKey indicates the current version of the application. 9 | ApplicationVersionLabelKey = "app.kubernetes.io/version" 10 | // ApplicationComponentLabelKey indicates the component within the architecture of an application. 11 | ApplicationComponentLabelKey = "app.kubernetes.io/component" 12 | // ApplicationManagedByLabelKey indicates the tool being used to manage the operation of an application. 
13 | ApplicationManagedByLabelKey = "app.kubernetes.io/managed-by" 14 | // ApplicationManagedByLabelValue is the specific tool being used to manage applications created by this project. 15 | ApplicationManagedByLabelValue = "distributed-compute-operator" 16 | // DescriptionAnnotationKey can be used to add extra information to a Kubernetes object via its annotations. 17 | DescriptionAnnotationKey = "distributed-compute.dominodatalab.com/description" 18 | ) 19 | 20 | // MetadataLabels returns a map used to label Kubernetes resources. 21 | func MetadataLabels(name, instance, version string) map[string]string { 22 | return map[string]string{ 23 | ApplicationNameLabelKey: name, 24 | ApplicationInstanceLabelKey: instance, 25 | ApplicationVersionLabelKey: version, 26 | ApplicationManagedByLabelKey: ApplicationManagedByLabelValue, 27 | } 28 | } 29 | 30 | // MetadataLabelsWithComponent returns a map used to label Kubernetes resources that act as unique components. 31 | func MetadataLabelsWithComponent(name, instance, version, component string) map[string]string { 32 | labels := MetadataLabels(name, instance, version) 33 | labels[ApplicationComponentLabelKey] = component 34 | 35 | return labels 36 | } 37 | 38 | // SelectorLabels returns a map used to select Kubernetes objects that have 39 | // been labeled with output from MetadataLabels. 40 | func SelectorLabels(name, instance string) map[string]string { 41 | return map[string]string{ 42 | ApplicationNameLabelKey: name, 43 | ApplicationInstanceLabelKey: instance, 44 | } 45 | } 46 | 47 | // SelectorLabelsWithComponent returns a map used to select Kubernetes objects 48 | // that have been labeled with output from MetadataLabelsWithComponent. 49 | func SelectorLabelsWithComponent(name, instance, component string) map[string]string { 50 | labels := SelectorLabels(name, instance) 51 | labels[ApplicationComponentLabelKey] = component 52 | 53 | return labels 54 | } 55 | -------------------------------------------------------------------------------- /pkg/resources/metadata_test.go: -------------------------------------------------------------------------------- 1 | package resources 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestMetadataLabels(t *testing.T) { 10 | actual := MetadataLabels("my-app", "inst", "v1.0.0") 11 | expected := map[string]string{ 12 | "app.kubernetes.io/name": "my-app", 13 | "app.kubernetes.io/instance": "inst", 14 | "app.kubernetes.io/version": "v1.0.0", 15 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 16 | } 17 | 18 | assert.Equal(t, expected, actual) 19 | } 20 | 21 | func TestMetadataLabelsWithComponent(t *testing.T) { 22 | actual := MetadataLabelsWithComponent("my-app", "inst", "v1.0.0", "comp") 23 | expected := map[string]string{ 24 | "app.kubernetes.io/name": "my-app", 25 | "app.kubernetes.io/instance": "inst", 26 | "app.kubernetes.io/version": "v1.0.0", 27 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 28 | "app.kubernetes.io/component": "comp", 29 | } 30 | 31 | assert.Equal(t, expected, actual) 32 | } 33 | 34 | func TestSelectorLabels(t *testing.T) { 35 | actual := SelectorLabels("my-app", "inst") 36 | expected := map[string]string{ 37 | "app.kubernetes.io/name": "my-app", 38 | "app.kubernetes.io/instance": "inst", 39 | } 40 | 41 | assert.Equal(t, expected, actual) 42 | } 43 | 44 | func TestSelectorLabelsWithComponent(t *testing.T) { 45 | actual := SelectorLabelsWithComponent("my-app", "inst", "comp") 46 | expected := map[string]string{ 
47 | "app.kubernetes.io/name": "my-app", 48 | "app.kubernetes.io/instance": "inst", 49 | "app.kubernetes.io/component": "comp", 50 | } 51 | 52 | assert.Equal(t, expected, actual) 53 | } 54 | -------------------------------------------------------------------------------- /pkg/resources/ray/helpers_test.go: -------------------------------------------------------------------------------- 1 | package ray 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | "k8s.io/utils/pointer" 7 | 8 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 9 | ) 10 | 11 | // rayClusterFixture should be used for all ray unit testing. 12 | func rayClusterFixture() *dcv1alpha1.RayCluster { 13 | return &dcv1alpha1.RayCluster{ 14 | TypeMeta: metav1.TypeMeta{ 15 | Kind: "RayCluster", 16 | APIVersion: "distributed-compute.dominodatalab.com/v1test1", 17 | }, 18 | ObjectMeta: metav1.ObjectMeta{ 19 | Name: "test-id", 20 | Namespace: "fake-ns", 21 | }, 22 | Spec: dcv1alpha1.RayClusterSpec{ 23 | ScalableClusterConfig: dcv1alpha1.ScalableClusterConfig{ 24 | ClusterConfig: dcv1alpha1.ClusterConfig{ 25 | Image: &dcv1alpha1.OCIImageDefinition{ 26 | Registry: "fake-reg", 27 | Repository: "fake-repo", 28 | Tag: "fake-tag", 29 | PullPolicy: corev1.PullIfNotPresent, 30 | }, 31 | }, 32 | }, 33 | Port: 6379, 34 | RedisShardPorts: []int32{ 35 | 6380, 36 | 6381, 37 | }, 38 | ClientServerPort: 10001, 39 | ObjectManagerPort: 2384, 40 | NodeManagerPort: 2385, 41 | GCSServerPort: 2386, 42 | WorkerPorts: []int32{11000, 11001}, 43 | DashboardPort: 8265, 44 | Worker: dcv1alpha1.RayClusterWorker{ 45 | Replicas: pointer.Int32(5), 46 | }, 47 | }, 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /pkg/resources/ray/horizontalpodautoscaler.go: -------------------------------------------------------------------------------- 1 | package ray 2 | 3 | import ( 4 | "fmt" 5 | 6 | autoscalingv2 "k8s.io/api/autoscaling/v2" 7 | corev1 "k8s.io/api/core/v1" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | 10 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 11 | ) 12 | 13 | // NewHorizontalPodAutoscaler generates an HPA that targets a RayCluster resource. 14 | // 15 | // The metrics-server needs to be launched separately and the worker stateful 16 | // set requires cpu resource requests in order for this object to have any 17 | // effect. 
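// Note that the constructor returns an error when Spec.Autoscaling is nil, and that
// ScaleTargetRef points at the RayCluster resource itself rather than at the worker
// StatefulSet, which presumably relies on the CRD exposing a scale subresource.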
18 | func NewHorizontalPodAutoscaler(rc *dcv1alpha1.RayCluster) (*autoscalingv2.HorizontalPodAutoscaler, error) { 19 | autoscaling := rc.Spec.Autoscaling 20 | if autoscaling == nil { 21 | return nil, fmt.Errorf("cannot build HPA without autoscaling config") 22 | } 23 | 24 | var behavior *autoscalingv2.HorizontalPodAutoscalerBehavior 25 | if autoscaling.ScaleDownStabilizationWindowSeconds != nil { 26 | behavior = &autoscalingv2.HorizontalPodAutoscalerBehavior{ 27 | ScaleDown: &autoscalingv2.HPAScalingRules{ 28 | StabilizationWindowSeconds: autoscaling.ScaleDownStabilizationWindowSeconds, 29 | }, 30 | } 31 | } 32 | 33 | var metrics []autoscalingv2.MetricSpec 34 | if autoscaling.AverageCPUUtilization != nil { 35 | metrics = append(metrics, autoscalingv2.MetricSpec{ 36 | Type: autoscalingv2.ResourceMetricSourceType, 37 | Resource: &autoscalingv2.ResourceMetricSource{ 38 | Name: corev1.ResourceCPU, 39 | Target: autoscalingv2.MetricTarget{ 40 | Type: autoscalingv2.UtilizationMetricType, 41 | AverageUtilization: autoscaling.AverageCPUUtilization, 42 | }, 43 | }, 44 | }) 45 | } 46 | if autoscaling.AverageMemoryUtilization != nil { 47 | metrics = append(metrics, autoscalingv2.MetricSpec{ 48 | Type: autoscalingv2.ResourceMetricSourceType, 49 | Resource: &autoscalingv2.ResourceMetricSource{ 50 | Name: corev1.ResourceMemory, 51 | Target: autoscalingv2.MetricTarget{ 52 | Type: autoscalingv2.UtilizationMetricType, 53 | AverageUtilization: autoscaling.AverageMemoryUtilization, 54 | }, 55 | }, 56 | }) 57 | } 58 | 59 | hpa := &autoscalingv2.HorizontalPodAutoscaler{ 60 | ObjectMeta: HorizontalPodAutoscalerObjectMeta(rc), 61 | Spec: autoscalingv2.HorizontalPodAutoscalerSpec{ 62 | ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{ 63 | APIVersion: rc.APIVersion, 64 | Kind: rc.Kind, 65 | Name: rc.Name, 66 | }, 67 | MinReplicas: autoscaling.MinReplicas, 68 | MaxReplicas: autoscaling.MaxReplicas, 69 | Metrics: metrics, 70 | Behavior: behavior, 71 | }, 72 | } 73 | 74 | return hpa, nil 75 | } 76 | 77 | // HorizontalPodAutoscalerObjectMeta returns the ObjectMeta object used to identify new HPA objects. 78 | func HorizontalPodAutoscalerObjectMeta(rc *dcv1alpha1.RayCluster) metav1.ObjectMeta { 79 | return metav1.ObjectMeta{ 80 | Name: InstanceObjectName(rc.Name, ComponentNone), 81 | Namespace: rc.Namespace, 82 | Labels: AddGlobalLabels(MetadataLabels(rc), rc.Spec.GlobalLabels), 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /pkg/resources/ray/podsecuritypolicy.go: -------------------------------------------------------------------------------- 1 | package ray 2 | 3 | import ( 4 | rbacv1 "k8s.io/api/rbac/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | 7 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 8 | ) 9 | 10 | var ( 11 | policyAPIGroups = []string{"policy"} 12 | podSecurityPolicyResources = []string{"podsecuritypolicies"} 13 | useVerbs = []string{"use"} 14 | ) 15 | 16 | // NewPodSecurityPolicyRBAC generates the role and role binding required to use a pod security policy. 17 | // The role is bound to the service account used by the ray cluster pods. 
18 | func NewPodSecurityPolicyRBAC(rc *dcv1alpha1.RayCluster) (*rbacv1.Role, *rbacv1.RoleBinding) { 19 | name := InstanceObjectName(rc.Name, ComponentNone) 20 | 21 | role := &rbacv1.Role{ 22 | ObjectMeta: metav1.ObjectMeta{ 23 | Name: name, 24 | Namespace: rc.Namespace, 25 | Labels: AddGlobalLabels(MetadataLabels(rc), rc.Spec.GlobalLabels), 26 | }, 27 | Rules: []rbacv1.PolicyRule{ 28 | { 29 | APIGroups: policyAPIGroups, 30 | Resources: podSecurityPolicyResources, 31 | Verbs: useVerbs, 32 | ResourceNames: []string{rc.Spec.PodSecurityPolicy}, 33 | }, 34 | }, 35 | } 36 | 37 | binding := &rbacv1.RoleBinding{ 38 | ObjectMeta: metav1.ObjectMeta{ 39 | Name: name, 40 | Namespace: rc.Namespace, 41 | Labels: AddGlobalLabels(MetadataLabels(rc), rc.Spec.GlobalLabels), 42 | }, 43 | RoleRef: rbacv1.RoleRef{ 44 | APIGroup: rbacv1.GroupName, 45 | Kind: "Role", 46 | Name: role.Name, 47 | }, 48 | Subjects: []rbacv1.Subject{ 49 | { 50 | Kind: rbacv1.ServiceAccountKind, 51 | Name: InstanceObjectName(rc.Name, ComponentNone), 52 | Namespace: rc.Namespace, 53 | }, 54 | }, 55 | } 56 | 57 | return role, binding 58 | } 59 | -------------------------------------------------------------------------------- /pkg/resources/ray/podsecuritypolicy_test.go: -------------------------------------------------------------------------------- 1 | package ray 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | rbacv1 "k8s.io/api/rbac/v1" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | ) 10 | 11 | func TestNewPodSecurityPolicyRBAC(t *testing.T) { 12 | rc := rayClusterFixture() 13 | rc.Spec.PodSecurityPolicy = "test-psp" 14 | role, roleBinding := NewPodSecurityPolicyRBAC(rc) 15 | 16 | t.Run("role", func(t *testing.T) { 17 | expected := &rbacv1.Role{ 18 | ObjectMeta: metav1.ObjectMeta{ 19 | Name: "test-id-ray", 20 | Namespace: "fake-ns", 21 | Labels: map[string]string{ 22 | "app.kubernetes.io/name": "ray", 23 | "app.kubernetes.io/instance": "test-id", 24 | "app.kubernetes.io/version": "fake-tag", 25 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 26 | }, 27 | }, 28 | Rules: []rbacv1.PolicyRule{ 29 | { 30 | APIGroups: []string{"policy"}, 31 | Resources: []string{"podsecuritypolicies"}, 32 | Verbs: []string{"use"}, 33 | ResourceNames: []string{"test-psp"}, 34 | }, 35 | }, 36 | } 37 | assert.Equal(t, expected, role) 38 | }) 39 | 40 | t.Run("role_binding", func(t *testing.T) { 41 | expected := &rbacv1.RoleBinding{ 42 | ObjectMeta: metav1.ObjectMeta{ 43 | Name: "test-id-ray", 44 | Namespace: "fake-ns", 45 | Labels: map[string]string{ 46 | "app.kubernetes.io/name": "ray", 47 | "app.kubernetes.io/instance": "test-id", 48 | "app.kubernetes.io/version": "fake-tag", 49 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 50 | }, 51 | }, 52 | RoleRef: rbacv1.RoleRef{ 53 | APIGroup: "rbac.authorization.k8s.io", 54 | Kind: "Role", 55 | Name: "test-id-ray", 56 | }, 57 | Subjects: []rbacv1.Subject{ 58 | { 59 | Kind: "ServiceAccount", 60 | Name: "test-id-ray", 61 | Namespace: "fake-ns", 62 | }, 63 | }, 64 | } 65 | assert.Equal(t, expected, roleBinding) 66 | }) 67 | } 68 | -------------------------------------------------------------------------------- /pkg/resources/ray/ray.go: -------------------------------------------------------------------------------- 1 | package ray 2 | 3 | import ( 4 | "fmt" 5 | 6 | "sigs.k8s.io/controller-runtime/pkg/client" 7 | 8 | "github.com/dominodatalab/distributed-compute-operator/pkg/cluster/metadata" 9 | 10 | dcv1alpha1 
"github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 11 | "github.com/dominodatalab/distributed-compute-operator/pkg/resources" 12 | "github.com/dominodatalab/distributed-compute-operator/pkg/util" 13 | ) 14 | 15 | // Component is used to drive Kubernetes object generation for different ray types. 16 | type Component string 17 | 18 | const ( 19 | // ComponentNone indicates a generic ray resource. 20 | ComponentNone Component = "none" 21 | // ComponentHead indicates a ray head resource. 22 | ComponentHead Component = "head" 23 | // ComponentWorker indicates a ray worker resource. 24 | ComponentWorker Component = "worker" 25 | // ApplicationName defines the static name used to generate ray object metadata. 26 | ApplicationName = "ray" 27 | ) 28 | 29 | // InstanceObjectName returns the name that will be used to create most owned cluster resources. 30 | func InstanceObjectName(instance string, comp Component) string { 31 | if comp == ComponentNone { 32 | return fmt.Sprintf("%s-%s", instance, ApplicationName) 33 | } 34 | 35 | return fmt.Sprintf("%s-%s-%s", instance, ApplicationName, comp) 36 | } 37 | 38 | // HeadlessHeadServiceName returns the name of the headless service used to 39 | // register the head ray pod. 40 | func HeadlessHeadServiceName(name string) string { 41 | return InstanceObjectName(name, ComponentHead) 42 | } 43 | 44 | // HeadlessWorkerServiceName returns the name of the headless service used to 45 | // register ray worker pods. 46 | func HeadlessWorkerServiceName(name string) string { 47 | return InstanceObjectName(name, ComponentWorker) 48 | } 49 | 50 | // MetadataLabels returns standard metadata for ray resources. 51 | func MetadataLabels(rc *dcv1alpha1.RayCluster) map[string]string { 52 | return resources.MetadataLabels(ApplicationName, rc.Name, rc.Spec.Image.Tag) 53 | } 54 | 55 | // MetadataLabelsWithComponent returns standard component metadata for ray resources. 56 | func MetadataLabelsWithComponent(rc *dcv1alpha1.RayCluster, comp Component) map[string]string { 57 | return resources.MetadataLabelsWithComponent(ApplicationName, rc.Name, rc.Spec.Image.Tag, string(comp)) 58 | } 59 | 60 | // SelectorLabels returns a resource selector clause for ray resources. 61 | func SelectorLabels(rc *dcv1alpha1.RayCluster) map[string]string { 62 | return resources.SelectorLabels(ApplicationName, rc.Name) 63 | } 64 | 65 | // SelectorLabelsWithComponent returns a resource component selector clause for ray resources. 
66 | func SelectorLabelsWithComponent(rc *dcv1alpha1.RayCluster, comp Component) map[string]string { 67 | return resources.SelectorLabelsWithComponent(ApplicationName, rc.Name, string(comp)) 68 | } 69 | 70 | func AddGlobalLabels(labels map[string]string, globalLabels map[string]string) map[string]string { 71 | if globalLabels != nil { 72 | labels = util.MergeStringMaps(globalLabels, labels) 73 | } 74 | return labels 75 | } 76 | 77 | var Meta = metadata.NewProvider( 78 | ApplicationName, 79 | func(obj client.Object) string { return objToRayCluster(obj).Spec.Image.Tag }, 80 | func(obj client.Object) map[string]string { return objToRayCluster(obj).Spec.GlobalLabels }, 81 | ) 82 | 83 | func objToRayCluster(obj client.Object) *dcv1alpha1.RayCluster { 84 | return obj.(*dcv1alpha1.RayCluster) 85 | } 86 | -------------------------------------------------------------------------------- /pkg/resources/ray/ray_test.go: -------------------------------------------------------------------------------- 1 | package ray 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestInstanceObjectName(t *testing.T) { 10 | t.Run("with_component", func(t *testing.T) { 11 | comp := Component("test") 12 | actual := InstanceObjectName("steve-o", comp) 13 | assert.Equal(t, "steve-o-ray-test", actual) 14 | }) 15 | 16 | t.Run("component_none", func(t *testing.T) { 17 | actual := InstanceObjectName("steve-o", ComponentNone) 18 | assert.Equal(t, "steve-o-ray", actual) 19 | }) 20 | } 21 | 22 | func TestHeadlessHeadServiceName(t *testing.T) { 23 | actual := HeadlessHeadServiceName("steve-o") 24 | assert.Equal(t, "steve-o-ray-head", actual) 25 | } 26 | 27 | func TestHeadlessWorkerServiceName(t *testing.T) { 28 | actual := HeadlessWorkerServiceName("steve-o") 29 | assert.Equal(t, "steve-o-ray-worker", actual) 30 | } 31 | 32 | func TestMetadataLabels(t *testing.T) { 33 | rc := rayClusterFixture() 34 | actual := MetadataLabels(rc) 35 | 36 | expected := map[string]string{ 37 | "app.kubernetes.io/name": "ray", 38 | "app.kubernetes.io/instance": "test-id", 39 | "app.kubernetes.io/version": "fake-tag", 40 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 41 | } 42 | assert.Equal(t, expected, actual) 43 | } 44 | 45 | func TestMetadataLabelsWithComponent(t *testing.T) { 46 | rc := rayClusterFixture() 47 | actual := MetadataLabelsWithComponent(rc, "something") 48 | 49 | expected := map[string]string{ 50 | "app.kubernetes.io/name": "ray", 51 | "app.kubernetes.io/instance": "test-id", 52 | "app.kubernetes.io/version": "fake-tag", 53 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 54 | "app.kubernetes.io/component": "something", 55 | } 56 | assert.Equal(t, expected, actual) 57 | } 58 | 59 | func TestSelectorLabels(t *testing.T) { 60 | rc := rayClusterFixture() 61 | actual := SelectorLabels(rc) 62 | 63 | expected := map[string]string{ 64 | "app.kubernetes.io/name": "ray", 65 | "app.kubernetes.io/instance": "test-id", 66 | } 67 | assert.Equal(t, expected, actual) 68 | } 69 | 70 | func TestSelectorLabelsWithComponent(t *testing.T) { 71 | rc := rayClusterFixture() 72 | actual := SelectorLabelsWithComponent(rc, "something") 73 | 74 | expected := map[string]string{ 75 | "app.kubernetes.io/name": "ray", 76 | "app.kubernetes.io/instance": "test-id", 77 | "app.kubernetes.io/component": "something", 78 | } 79 | assert.Equal(t, expected, actual) 80 | } 81 | -------------------------------------------------------------------------------- /pkg/resources/ray/serviceaccount.go: 
-------------------------------------------------------------------------------- 1 | package ray 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | "k8s.io/utils/pointer" 7 | 8 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 9 | ) 10 | 11 | // NewServiceAccount generates a service account resource without API access. 12 | func NewServiceAccount(rc *dcv1alpha1.RayCluster) *corev1.ServiceAccount { 13 | return &corev1.ServiceAccount{ 14 | ObjectMeta: metav1.ObjectMeta{ 15 | Name: InstanceObjectName(rc.Name, ComponentNone), 16 | Namespace: rc.Namespace, 17 | Labels: AddGlobalLabels(MetadataLabels(rc), rc.Spec.GlobalLabels), 18 | }, 19 | AutomountServiceAccountToken: pointer.Bool(false), 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /pkg/resources/ray/serviceaccount_test.go: -------------------------------------------------------------------------------- 1 | package ray 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | corev1 "k8s.io/api/core/v1" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | "k8s.io/utils/pointer" 10 | ) 11 | 12 | func TestNewServiceAccount(t *testing.T) { 13 | rc := rayClusterFixture() 14 | sa := NewServiceAccount(rc) 15 | 16 | expected := &corev1.ServiceAccount{ 17 | ObjectMeta: metav1.ObjectMeta{ 18 | Name: "test-id-ray", 19 | Namespace: "fake-ns", 20 | Labels: map[string]string{ 21 | "app.kubernetes.io/name": "ray", 22 | "app.kubernetes.io/instance": "test-id", 23 | "app.kubernetes.io/version": "fake-tag", 24 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 25 | }, 26 | }, 27 | AutomountServiceAccountToken: pointer.Bool(false), 28 | } 29 | assert.Equal(t, expected, sa) 30 | } 31 | -------------------------------------------------------------------------------- /pkg/resources/spark/configmap.go: -------------------------------------------------------------------------------- 1 | package spark 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | "strings" 7 | 8 | corev1 "k8s.io/api/core/v1" 9 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 10 | 11 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 12 | ) 13 | 14 | // NewFrameworkConfigMap generates a configmap which represents a spark-defaults.conf file out of provided config 15 | func NewFrameworkConfigMap(sc *dcv1alpha1.SparkCluster) *corev1.ConfigMap { 16 | data := map[string]string{} 17 | if sc.Spec.Master.DefaultConfiguration != nil { 18 | data[string(ComponentMaster)] = generateSparkDefaults(sc.Spec.Master.DefaultConfiguration) 19 | } 20 | if sc.Spec.Worker.DefaultConfiguration != nil { 21 | data[string(ComponentWorker)] = generateSparkDefaults(sc.Spec.Worker.DefaultConfiguration) 22 | } 23 | if len(data) == 0 { 24 | return nil 25 | } 26 | return &corev1.ConfigMap{ 27 | ObjectMeta: metav1.ObjectMeta{ 28 | Name: FrameworkConfigMapName(sc.Name, ComponentNone), 29 | Namespace: sc.Namespace, 30 | Labels: AddGlobalLabels(MetadataLabels(sc), sc.Spec.GlobalLabels), 31 | }, 32 | Data: data, 33 | } 34 | } 35 | 36 | // NewKeyTabConfigMap generates a configmap which represents the Kerberos KeyTab configuration out of provided config 37 | func NewKeyTabConfigMap(sc *dcv1alpha1.SparkCluster) *corev1.ConfigMap { 38 | binaryData := map[string][]byte{} 39 | 40 | if sc.Spec.KerberosKeytab != nil { 41 | binaryData["keytab"] = sc.Spec.KerberosKeytab.Contents 42 | } 43 | 44 | if len(binaryData) == 0 { 45 | return nil 46 | } 47 | 
48 | return &corev1.ConfigMap{ 49 | ObjectMeta: metav1.ObjectMeta{ 50 | Name: KeyTabConfigMapName(sc.Name, ComponentNone), 51 | Namespace: sc.Namespace, 52 | Labels: AddGlobalLabels(MetadataLabels(sc), sc.Spec.GlobalLabels), 53 | }, 54 | BinaryData: binaryData, 55 | } 56 | } 57 | 58 | // looks a little weird because map iteration isn't stable in go, but we want to provide a stable interface 59 | // so we sort the keys and emit a config in sorted order 60 | func generateSparkDefaults(defaults map[string]string) string { 61 | var keys []string 62 | for k := range defaults { 63 | keys = append(keys, k) 64 | } 65 | sort.Strings(keys) 66 | b := strings.Builder{} 67 | for _, k := range keys { 68 | b.WriteString(fmt.Sprintf("%s %s\n", k, defaults[k])) 69 | } 70 | return b.String() 71 | } 72 | -------------------------------------------------------------------------------- /pkg/resources/spark/envoyfilter.go: -------------------------------------------------------------------------------- 1 | package spark 2 | 3 | import ( 4 | "fmt" 5 | 6 | spb "google.golang.org/protobuf/types/known/structpb" 7 | networkingv1alpha3 "istio.io/api/networking/v1alpha3" 8 | apinetworkingv1alpha3 "istio.io/client-go/pkg/apis/networking/v1alpha3" 9 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 10 | 11 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 12 | ) 13 | 14 | const filterName = "envoy.filters.network.tcp_proxy" 15 | 16 | // NewEnvoyFilter creates a new EnvoyFilter resource to set idle_timeout for Istio-enabled deployments 17 | func NewEnvoyFilter(sc *dcv1alpha1.SparkCluster) *apinetworkingv1alpha3.EnvoyFilter { 18 | match := networkingv1alpha3.EnvoyFilter_EnvoyConfigObjectMatch{ 19 | Context: networkingv1alpha3.EnvoyFilter_ANY, 20 | ObjectTypes: &networkingv1alpha3.EnvoyFilter_EnvoyConfigObjectMatch_Listener{ 21 | Listener: &networkingv1alpha3.EnvoyFilter_ListenerMatch{ 22 | FilterChain: &networkingv1alpha3.EnvoyFilter_ListenerMatch_FilterChainMatch{ 23 | Filter: &networkingv1alpha3.EnvoyFilter_ListenerMatch_FilterMatch{ 24 | Name: filterName, 25 | }, 26 | }, 27 | }, 28 | }, 29 | } 30 | 31 | patch := networkingv1alpha3.EnvoyFilter_Patch{ 32 | Operation: networkingv1alpha3.EnvoyFilter_Patch_MERGE, 33 | Value: &spb.Struct{ 34 | Fields: map[string]*spb.Value{ 35 | "name": { 36 | Kind: &spb.Value_StringValue{ 37 | StringValue: "envoy.filters.network.tcp_proxy", 38 | }, 39 | }, 40 | "typed_config": { 41 | Kind: &spb.Value_StructValue{ 42 | StructValue: &spb.Struct{ 43 | Fields: map[string]*spb.Value{ 44 | "@type": { 45 | Kind: &spb.Value_StringValue{ 46 | StringValue: "type.googleapis.com/envoy.extensions.filters.network.tcp_proxy.v3.TcpProxy", 47 | }, 48 | }, 49 | "idle_timeout": { 50 | Kind: &spb.Value_StringValue{ 51 | StringValue: "0s", 52 | }, 53 | }, 54 | }, 55 | }, 56 | }, 57 | }, 58 | }, 59 | }, 60 | } 61 | 62 | configPatches := []*networkingv1alpha3.EnvoyFilter_EnvoyConfigObjectPatch{ 63 | { 64 | ApplyTo: networkingv1alpha3.EnvoyFilter_NETWORK_FILTER, 65 | Match: &match, 66 | Patch: &patch, 67 | }, 68 | } 69 | 70 | workloadSelector := networkingv1alpha3.WorkloadSelector{ 71 | Labels: sc.Spec.EnvoyFilterLabels, 72 | } 73 | 74 | envoyFilter := &apinetworkingv1alpha3.EnvoyFilter{ 75 | TypeMeta: metav1.TypeMeta{}, 76 | ObjectMeta: metav1.ObjectMeta{ 77 | Name: fmt.Sprintf("%s-%s", InstanceObjectName(sc.Name, ComponentNone), "envoyfilter"), 78 | Namespace: sc.Namespace, 79 | Labels: AddGlobalLabels(MetadataLabels(sc), sc.Labels), 80 | }, 81 | Spec: 
networkingv1alpha3.EnvoyFilter{ 82 | WorkloadSelector: &workloadSelector, 83 | ConfigPatches: configPatches, 84 | }, 85 | } 86 | 87 | return envoyFilter 88 | } 89 | -------------------------------------------------------------------------------- /pkg/resources/spark/envoyfilter_test.go: -------------------------------------------------------------------------------- 1 | package spark 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | spb "google.golang.org/protobuf/types/known/structpb" 8 | networkingv1alpha3 "istio.io/api/networking/v1alpha3" 9 | apinetworkingv1alpha3 "istio.io/client-go/pkg/apis/networking/v1alpha3" 10 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 11 | ) 12 | 13 | func TestNewEnvoyFilter(t *testing.T) { 14 | t.Run("default", func(t *testing.T) { 15 | sc := sparkClusterFixture() 16 | actual := NewEnvoyFilter(sc) 17 | 18 | patch := networkingv1alpha3.EnvoyFilter_Patch{ 19 | Operation: networkingv1alpha3.EnvoyFilter_Patch_MERGE, 20 | Value: &spb.Struct{ 21 | Fields: map[string]*spb.Value{ 22 | "name": { 23 | Kind: &spb.Value_StringValue{ 24 | StringValue: "envoy.filters.network.tcp_proxy", 25 | }, 26 | }, 27 | "typed_config": { 28 | Kind: &spb.Value_StructValue{ 29 | StructValue: &spb.Struct{ 30 | Fields: map[string]*spb.Value{ 31 | "@type": { 32 | Kind: &spb.Value_StringValue{ 33 | StringValue: "type.googleapis.com/envoy.extensions.filters.network.tcp_proxy.v3.TcpProxy", 34 | }, 35 | }, 36 | "idle_timeout": { 37 | Kind: &spb.Value_StringValue{ 38 | StringValue: "0s", 39 | }, 40 | }, 41 | }, 42 | }, 43 | }, 44 | }, 45 | }, 46 | }, 47 | } 48 | 49 | configPatches := []*networkingv1alpha3.EnvoyFilter_EnvoyConfigObjectPatch{ 50 | { 51 | ApplyTo: networkingv1alpha3.EnvoyFilter_NETWORK_FILTER, 52 | Match: &networkingv1alpha3.EnvoyFilter_EnvoyConfigObjectMatch{ 53 | Context: networkingv1alpha3.EnvoyFilter_ANY, 54 | ObjectTypes: &networkingv1alpha3.EnvoyFilter_EnvoyConfigObjectMatch_Listener{ 55 | Listener: &networkingv1alpha3.EnvoyFilter_ListenerMatch{ 56 | FilterChain: &networkingv1alpha3.EnvoyFilter_ListenerMatch_FilterChainMatch{ 57 | Filter: &networkingv1alpha3.EnvoyFilter_ListenerMatch_FilterMatch{ 58 | Name: "envoy.filters.network.tcp_proxy", 59 | }, 60 | }, 61 | }, 62 | }, 63 | }, 64 | Patch: &patch, 65 | }, 66 | } 67 | 68 | workloadSelector := networkingv1alpha3.WorkloadSelector{ 69 | Labels: sc.Spec.EnvoyFilterLabels, 70 | } 71 | 72 | expected := &apinetworkingv1alpha3.EnvoyFilter{ 73 | TypeMeta: metav1.TypeMeta{}, 74 | ObjectMeta: metav1.ObjectMeta{ 75 | Name: "test-id-spark-envoyfilter", 76 | Namespace: sc.Namespace, 77 | Labels: MetadataLabels(sc), 78 | }, 79 | Spec: networkingv1alpha3.EnvoyFilter{ 80 | WorkloadSelector: &workloadSelector, 81 | ConfigPatches: configPatches, 82 | }, 83 | } 84 | 85 | assert.Equal(t, expected, actual, "Istio EnvoyFilter not correctly generated") 86 | }) 87 | } 88 | -------------------------------------------------------------------------------- /pkg/resources/spark/helpers_test.go: -------------------------------------------------------------------------------- 1 | package spark 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | "k8s.io/utils/pointer" 7 | 8 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 9 | ) 10 | 11 | // sparkClusterFixture should be used for all spark unit testing. 
12 | func sparkClusterFixture() *dcv1alpha1.SparkCluster { 13 | return &dcv1alpha1.SparkCluster{ 14 | TypeMeta: metav1.TypeMeta{ 15 | Kind: "SparkCluster", 16 | APIVersion: "distributed-compute.dominodatalab.com/v1test1", 17 | }, 18 | ObjectMeta: metav1.ObjectMeta{ 19 | Name: "test-id", 20 | Namespace: "fake-ns", 21 | }, 22 | Spec: dcv1alpha1.SparkClusterSpec{ 23 | ScalableClusterConfig: dcv1alpha1.ScalableClusterConfig{ 24 | ClusterConfig: dcv1alpha1.ClusterConfig{ 25 | Image: &dcv1alpha1.OCIImageDefinition{ 26 | Registry: "fake-reg", 27 | Repository: "fake-repo", 28 | Tag: "fake-tag", 29 | PullPolicy: corev1.PullIfNotPresent, 30 | }, 31 | }, 32 | }, 33 | ClusterPort: 7077, 34 | MasterWebPort: 8080, 35 | WorkerWebPort: 8081, 36 | WorkerMemoryLimit: "4505m", 37 | Worker: dcv1alpha1.SparkClusterWorker{ 38 | Replicas: pointer.Int32(5), 39 | }, 40 | }, 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /pkg/resources/spark/horizontalpodautoscaler.go: -------------------------------------------------------------------------------- 1 | package spark 2 | 3 | import ( 4 | "fmt" 5 | 6 | autoscalingv2 "k8s.io/api/autoscaling/v2" 7 | corev1 "k8s.io/api/core/v1" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | 10 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 11 | ) 12 | 13 | // NewHorizontalPodAutoscaler generates an HPA that targets a SparkCluster resource. 14 | // 15 | // The metrics-server needs to be launched separately and the worker deployment 16 | // requires cpu resource requests in order for this object to have any effect. 17 | func NewHorizontalPodAutoscaler(sc *dcv1alpha1.SparkCluster) (*autoscalingv2.HorizontalPodAutoscaler, error) { 18 | autoscaling := sc.Spec.Autoscaling 19 | if autoscaling == nil { 20 | return nil, fmt.Errorf("cannot build HPA without autoscaling config") 21 | } 22 | 23 | var behavior *autoscalingv2.HorizontalPodAutoscalerBehavior 24 | if autoscaling.ScaleDownStabilizationWindowSeconds != nil { 25 | behavior = &autoscalingv2.HorizontalPodAutoscalerBehavior{ 26 | ScaleDown: &autoscalingv2.HPAScalingRules{ 27 | StabilizationWindowSeconds: autoscaling.ScaleDownStabilizationWindowSeconds, 28 | }, 29 | } 30 | } 31 | 32 | var metrics []autoscalingv2.MetricSpec 33 | if autoscaling.AverageCPUUtilization != nil { 34 | metrics = append(metrics, autoscalingv2.MetricSpec{ 35 | Type: autoscalingv2.ResourceMetricSourceType, 36 | Resource: &autoscalingv2.ResourceMetricSource{ 37 | Name: corev1.ResourceCPU, 38 | Target: autoscalingv2.MetricTarget{ 39 | Type: autoscalingv2.UtilizationMetricType, 40 | AverageUtilization: autoscaling.AverageCPUUtilization, 41 | }, 42 | }, 43 | }) 44 | } 45 | if autoscaling.AverageMemoryUtilization != nil { 46 | metrics = append(metrics, autoscalingv2.MetricSpec{ 47 | Type: autoscalingv2.ResourceMetricSourceType, 48 | Resource: &autoscalingv2.ResourceMetricSource{ 49 | Name: corev1.ResourceMemory, 50 | Target: autoscalingv2.MetricTarget{ 51 | Type: autoscalingv2.UtilizationMetricType, 52 | AverageUtilization: autoscaling.AverageMemoryUtilization, 53 | }, 54 | }, 55 | }) 56 | } 57 | 58 | hpa := &autoscalingv2.HorizontalPodAutoscaler{ 59 | ObjectMeta: HorizontalPodAutoscalerObjectMeta(sc), 60 | Spec: autoscalingv2.HorizontalPodAutoscalerSpec{ 61 | ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{ 62 | APIVersion: sc.APIVersion, 63 | Kind: sc.Kind, 64 | Name: sc.Name, 65 | }, 66 | MinReplicas: autoscaling.MinReplicas, 67 | MaxReplicas: autoscaling.MaxReplicas, 
68 | Metrics: metrics, 69 | Behavior: behavior, 70 | }, 71 | } 72 | 73 | return hpa, nil 74 | } 75 | 76 | // HorizontalPodAutoscalerObjectMeta returns the ObjectMeta object used to identify new HPA objects. 77 | func HorizontalPodAutoscalerObjectMeta(sc *dcv1alpha1.SparkCluster) metav1.ObjectMeta { 78 | return metav1.ObjectMeta{ 79 | Name: InstanceObjectName(sc.Name, ComponentNone), 80 | Namespace: sc.Namespace, 81 | Labels: AddGlobalLabels(MetadataLabels(sc), sc.Spec.GlobalLabels), 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /pkg/resources/spark/podsecuritypolicy.go: -------------------------------------------------------------------------------- 1 | package spark 2 | 3 | import ( 4 | rbacv1 "k8s.io/api/rbac/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | 7 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 8 | ) 9 | 10 | var ( 11 | policyAPIGroups = []string{"policy"} 12 | podSecurityPolicyResources = []string{"podsecuritypolicies"} 13 | useVerbs = []string{"use"} 14 | ) 15 | 16 | // NewPodSecurityPolicyRBAC generates the role and role binding required to use a pod security policy. 17 | // The role is bound to the service account used by the spark cluster pods. 18 | func NewPodSecurityPolicyRBAC(sc *dcv1alpha1.SparkCluster) (*rbacv1.Role, *rbacv1.RoleBinding) { 19 | name := InstanceObjectName(sc.Name, ComponentNone) 20 | 21 | role := &rbacv1.Role{ 22 | ObjectMeta: metav1.ObjectMeta{ 23 | Name: name, 24 | Namespace: sc.Namespace, 25 | Labels: AddGlobalLabels(MetadataLabels(sc), sc.Spec.GlobalLabels), 26 | }, 27 | Rules: []rbacv1.PolicyRule{ 28 | { 29 | APIGroups: policyAPIGroups, 30 | Resources: podSecurityPolicyResources, 31 | Verbs: useVerbs, 32 | ResourceNames: []string{sc.Spec.PodSecurityPolicy}, 33 | }, 34 | }, 35 | } 36 | 37 | binding := &rbacv1.RoleBinding{ 38 | ObjectMeta: metav1.ObjectMeta{ 39 | Name: name, 40 | Namespace: sc.Namespace, 41 | Labels: AddGlobalLabels(MetadataLabels(sc), sc.Spec.GlobalLabels), 42 | }, 43 | RoleRef: rbacv1.RoleRef{ 44 | APIGroup: rbacv1.GroupName, 45 | Kind: "Role", 46 | Name: role.Name, 47 | }, 48 | Subjects: []rbacv1.Subject{ 49 | { 50 | Kind: rbacv1.ServiceAccountKind, 51 | Name: InstanceObjectName(sc.Name, ComponentNone), 52 | Namespace: sc.Namespace, 53 | }, 54 | }, 55 | } 56 | 57 | return role, binding 58 | } 59 | -------------------------------------------------------------------------------- /pkg/resources/spark/podsecuritypolicy_test.go: -------------------------------------------------------------------------------- 1 | package spark 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | rbacv1 "k8s.io/api/rbac/v1" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | ) 10 | 11 | func TestNewPodSecurityPolicyRBAC(t *testing.T) { 12 | rc := sparkClusterFixture() 13 | rc.Spec.PodSecurityPolicy = "test-psp" 14 | role, roleBinding := NewPodSecurityPolicyRBAC(rc) 15 | 16 | t.Run("role", func(t *testing.T) { 17 | expected := &rbacv1.Role{ 18 | ObjectMeta: metav1.ObjectMeta{ 19 | Name: "test-id-spark", 20 | Namespace: "fake-ns", 21 | Labels: map[string]string{ 22 | "app.kubernetes.io/name": "spark", 23 | "app.kubernetes.io/instance": "test-id", 24 | "app.kubernetes.io/version": "fake-tag", 25 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 26 | }, 27 | }, 28 | Rules: []rbacv1.PolicyRule{ 29 | { 30 | APIGroups: []string{"policy"}, 31 | Resources: []string{"podsecuritypolicies"}, 32 | Verbs: []string{"use"}, 33 
| ResourceNames: []string{"test-psp"}, 34 | }, 35 | }, 36 | } 37 | assert.Equal(t, expected, role) 38 | }) 39 | 40 | t.Run("role_binding", func(t *testing.T) { 41 | expected := &rbacv1.RoleBinding{ 42 | ObjectMeta: metav1.ObjectMeta{ 43 | Name: "test-id-spark", 44 | Namespace: "fake-ns", 45 | Labels: map[string]string{ 46 | "app.kubernetes.io/name": "spark", 47 | "app.kubernetes.io/instance": "test-id", 48 | "app.kubernetes.io/version": "fake-tag", 49 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 50 | }, 51 | }, 52 | RoleRef: rbacv1.RoleRef{ 53 | APIGroup: "rbac.authorization.k8s.io", 54 | Kind: "Role", 55 | Name: "test-id-spark", 56 | }, 57 | Subjects: []rbacv1.Subject{ 58 | { 59 | Kind: "ServiceAccount", 60 | Name: "test-id-spark", 61 | Namespace: "fake-ns", 62 | }, 63 | }, 64 | } 65 | assert.Equal(t, expected, roleBinding) 66 | }) 67 | } 68 | -------------------------------------------------------------------------------- /pkg/resources/spark/serviceaccount.go: -------------------------------------------------------------------------------- 1 | package spark 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | "k8s.io/utils/pointer" 7 | 8 | dcv1alpha1 "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 9 | ) 10 | 11 | // NewServiceAccount generates a service account resource without API access. 12 | func NewServiceAccount(sc *dcv1alpha1.SparkCluster) *corev1.ServiceAccount { 13 | return &corev1.ServiceAccount{ 14 | ObjectMeta: metav1.ObjectMeta{ 15 | Name: InstanceObjectName(sc.Name, ComponentNone), 16 | Namespace: sc.Namespace, 17 | Labels: AddGlobalLabels(MetadataLabels(sc), sc.Spec.GlobalLabels), 18 | }, 19 | AutomountServiceAccountToken: pointer.Bool(false), 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /pkg/resources/spark/serviceaccount_test.go: -------------------------------------------------------------------------------- 1 | package spark 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | corev1 "k8s.io/api/core/v1" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | "k8s.io/utils/pointer" 10 | ) 11 | 12 | func TestNewServiceAccount(t *testing.T) { 13 | rc := sparkClusterFixture() 14 | sa := NewServiceAccount(rc) 15 | 16 | expected := &corev1.ServiceAccount{ 17 | ObjectMeta: metav1.ObjectMeta{ 18 | Name: "test-id-spark", 19 | Namespace: "fake-ns", 20 | Labels: map[string]string{ 21 | "app.kubernetes.io/name": "spark", 22 | "app.kubernetes.io/instance": "test-id", 23 | "app.kubernetes.io/version": "fake-tag", 24 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 25 | }, 26 | }, 27 | AutomountServiceAccountToken: pointer.Bool(false), 28 | } 29 | assert.Equal(t, expected, sa) 30 | } 31 | -------------------------------------------------------------------------------- /pkg/resources/spark/spark_test.go: -------------------------------------------------------------------------------- 1 | package spark 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestHeadServiceName(t *testing.T) { 10 | actual := MasterServiceName("steve-o") 11 | assert.Equal(t, "steve-o-spark-master", actual) 12 | } 13 | 14 | func TestInstanceObjectName(t *testing.T) { 15 | t.Run("with_component", func(t *testing.T) { 16 | comp := Component("test") 17 | actual := InstanceObjectName("steve-o", comp) 18 | assert.Equal(t, "steve-o-spark-test", actual) 19 | }) 20 | 21 | 
t.Run("component_none", func(t *testing.T) { 22 | actual := InstanceObjectName("steve-o", ComponentNone) 23 | assert.Equal(t, "steve-o-spark", actual) 24 | }) 25 | } 26 | 27 | func TestMetadataLabels(t *testing.T) { 28 | rc := sparkClusterFixture() 29 | actual := MetadataLabels(rc) 30 | 31 | expected := map[string]string{ 32 | "app.kubernetes.io/name": "spark", 33 | "app.kubernetes.io/instance": "test-id", 34 | "app.kubernetes.io/version": "fake-tag", 35 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 36 | } 37 | assert.Equal(t, expected, actual) 38 | } 39 | 40 | func TestMetadataLabelsWithComponent(t *testing.T) { 41 | rc := sparkClusterFixture() 42 | actual := MetadataLabelsWithComponent(rc, Component("something")) 43 | 44 | expected := map[string]string{ 45 | "app.kubernetes.io/name": "spark", 46 | "app.kubernetes.io/instance": "test-id", 47 | "app.kubernetes.io/version": "fake-tag", 48 | "app.kubernetes.io/managed-by": "distributed-compute-operator", 49 | "app.kubernetes.io/component": "something", 50 | } 51 | assert.Equal(t, expected, actual) 52 | } 53 | 54 | func TestSelectorLabels(t *testing.T) { 55 | rc := sparkClusterFixture() 56 | actual := SelectorLabels(rc) 57 | 58 | expected := map[string]string{ 59 | "app.kubernetes.io/name": "spark", 60 | "app.kubernetes.io/instance": "test-id", 61 | } 62 | assert.Equal(t, expected, actual) 63 | } 64 | 65 | func TestSelectorLabelsWithComponent(t *testing.T) { 66 | rc := sparkClusterFixture() 67 | actual := SelectorLabelsWithComponent(rc, Component("something")) 68 | 69 | expected := map[string]string{ 70 | "app.kubernetes.io/name": "spark", 71 | "app.kubernetes.io/instance": "test-id", 72 | "app.kubernetes.io/component": "something", 73 | } 74 | assert.Equal(t, expected, actual) 75 | } 76 | 77 | func TestFrameworkConfigMapName(t *testing.T) { 78 | rc := sparkClusterFixture() 79 | actual := FrameworkConfigMapName(rc.Name, Component("something")) 80 | 81 | expected := "test-id-framework-spark-something" 82 | 83 | assert.Equal(t, expected, actual) 84 | } 85 | 86 | func TestKeyTabConfigMapName(t *testing.T) { 87 | rc := sparkClusterFixture() 88 | actual := KeyTabConfigMapName(rc.Name, Component("something")) 89 | 90 | expected := "test-id-keytab-spark-something" 91 | 92 | assert.Equal(t, expected, actual) 93 | } 94 | -------------------------------------------------------------------------------- /pkg/util/util.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "fmt" 5 | "strconv" 6 | 7 | "github.com/distribution/reference" 8 | 9 | "github.com/dominodatalab/distributed-compute-operator/api/v1alpha1" 10 | ) 11 | 12 | // IntsToStrings converts an integer slice into a string slice. 13 | func IntsToStrings(is []int32) (ss []string) { 14 | for _, i := range is { 15 | ss = append(ss, strconv.Itoa(int(i))) 16 | } 17 | return 18 | } 19 | 20 | // MergeStringMaps merges the src map into the dst. 21 | func MergeStringMaps(src, dst map[string]string) map[string]string { 22 | for k, v := range src { 23 | dst[k] = v 24 | } 25 | return dst 26 | } 27 | 28 | // ParseImageDefinition generates a fully-qualified image reference to an OCI image. 29 | // An error will be returned when the image definition is invalid. 
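// For illustration (hypothetical values): Registry "quay.io", Repository "domino/ray",
// and Tag "1.0" produce "quay.io/domino/ray:1.0", while a bare Repository of "ray" is
// normalized by reference.ParseNormalizedNamed and reference.TagNameOnly to
// "docker.io/library/ray:latest".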
30 | func ParseImageDefinition(def *v1alpha1.OCIImageDefinition) (string, error) { 31 | ref := def.Repository 32 | 33 | if def.Registry != "" { 34 | ref = fmt.Sprintf("%s/%s", def.Registry, ref) 35 | } 36 | if def.Tag != "" { 37 | ref = fmt.Sprintf("%s:%s", ref, def.Tag) 38 | } 39 | 40 | named, err := reference.ParseNormalizedNamed(ref) 41 | if err != nil { 42 | return "", fmt.Errorf("invalid OCIImageDefinition: %w", err) 43 | } 44 | named = reference.TagNameOnly(named) 45 | 46 | return named.String(), nil 47 | } 48 | 49 | // BoolPtrIsTrue returns true if bool pointer is true. This returns false if 50 | // pointer is false or nil. 51 | func BoolPtrIsTrue(ptr *bool) bool { 52 | return ptr != nil && *ptr 53 | } 54 | 55 | // BoolPtrIsNilOrFalse returns true if bool pointer is nil or false, otherwise 56 | // this returns false. 57 | func BoolPtrIsNilOrFalse(ptr *bool) bool { 58 | return ptr == nil || !*ptr 59 | } 60 | 61 | // GetIndexFromSlice returns the index of a specific string in a slice or -1 if the value is not present. 62 | func GetIndexFromSlice(s []string, match string) int { 63 | for index, val := range s { 64 | if val == match { 65 | return index 66 | } 67 | } 68 | return -1 69 | } 70 | 71 | // RemoveFromSlice removes index i from slice s. Does not maintain order of the original slice. 72 | // https://stackoverflow.com/a/37335777/13979167 73 | func RemoveFromSlice(s []string, i int) []string { 74 | if i >= 0 && i < len(s) { 75 | s[len(s)-1], s[i] = s[i], s[len(s)-1] 76 | return s[:len(s)-1] 77 | } 78 | return s 79 | } 80 | -------------------------------------------------------------------------------- /scripts/hotpatch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | IMAGE_NAME=${IMAGE_NAME:-"quay.io/domino/distributed-compute-operator"} 3 | IMAGE_TAG_PREFIX=${IMAGE_TAG_PREFIX:-"dev-"} 4 | latest_tag="$IMAGE_TAG_PREFIX$(date +%s)" 5 | image="$IMAGE_NAME:$latest_tag" 6 | make manifests generate docker-build IMG="$image" 7 | 8 | declare -r COMPUTE_NAMESPACE=$(kubectl get namespaces -ojson | jq -rc '.items[] | select(.metadata.name | endswith("-compute")) | .metadata.name') 9 | 10 | docker push $image 11 | 12 | helm upgrade \ 13 | distributed-compute-operator \ 14 | deploy/helm/distributed-compute-operator \ 15 | --install \ 16 | -n $COMPUTE_NAMESPACE \ 17 | --set image.registry="quay.io" \ 18 | --set image.repository="domino/distributed-compute-operator" \ 19 | --set image.tag="$latest_tag" \ 20 | --set config.logDevelopmentMode=true \ 21 | --set istio.enabled=true \ 22 | --set istio.cniPluginInstalled=true \ 23 | --set networkPolicy.enabled=true \ 24 | -------------------------------------------------------------------------------- /scripts/release/before-hook.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Performs actions required prior to a release. 4 | 5 | set -eu 6 | 7 | # ensure dependencies are in-sync prior to builds 8 | go mod tidy 9 | 10 | # ensure crds are up-to-date 11 | make manifests 12 | 13 | # copy crds into a known location for goreleaser 14 | dir=custom-resource-definitions 15 | mkdir -p $dir 16 | cp config/crd/bases/*.yaml $dir 17 | -------------------------------------------------------------------------------- /scripts/release/helm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Functions used to log into helm registries, and package/push project chart. 
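# Illustrative invocations (registry, user, and version values are placeholders, not
# taken from this repository's CI configuration):
#   ./helm.sh login -h quay.io -u robot-account -p "$QUAY_TOKEN" -n domino
#   ./helm.sh push -r quay.io/domino -v 1.2.3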
4 | 5 | set -euo pipefail 6 | 7 | HELM_BIN=${HELM_BIN:-helm} 8 | 9 | function dco::helm::login() { 10 | local registry="$1" 11 | local username="$2" 12 | local password="$3" 13 | local namespace="$4" 14 | 15 | echo "$password" | $HELM_BIN registry login "$registry" \ 16 | --namespace "$namespace" \ 17 | --username "$username" \ 18 | --password-stdin 19 | } 20 | 21 | function dco::helm::push() { 22 | local registry=$1 23 | local version=$2 24 | local semantic_version 25 | local chart_path 26 | 27 | if [[ $version =~ ^(pr-[[:digit:]]+|main)$ ]]; then 28 | semantic_version="0.0.0-$version" 29 | else 30 | semantic_version=$version 31 | fi 32 | 33 | $HELM_BIN package deploy/helm/distributed-compute-operator \ 34 | --destination chart-archives \ 35 | --app-version "$version" \ 36 | --version "$semantic_version" 37 | 38 | chart_path="chart-archives/distributed-compute-operator-$semantic_version.tgz" 39 | 40 | $HELM_BIN push "$chart_path" oci://"$registry" 41 | 42 | rm -rf chart-archives/ 43 | } 44 | 45 | function dco::helm::main() { 46 | local command=$1 47 | shift 48 | 49 | case $command in 50 | login) 51 | local host="" 52 | local username="" 53 | local password="" 54 | local namespace="" 55 | local usage 56 | 57 | usage="usage: $(basename "$0") login -h HOST -u USERNAME -p PASSWORD [-n NAMESPACE]" 58 | while getopts h:u:p:n: opt; do 59 | case $opt in 60 | h) 61 | host=$OPTARG 62 | ;; 63 | u) 64 | username=$OPTARG 65 | ;; 66 | p) 67 | password=$OPTARG 68 | ;; 69 | n) 70 | namespace=$OPTARG 71 | ;; 72 | *) 73 | echo "$usage" 74 | exit 1 75 | esac 76 | done 77 | shift $((OPTIND -1)) 78 | 79 | if [[ -z $host ]] || [[ -z $username ]] || [[ -z $password ]]; then 80 | echo "$usage" 81 | exit 1 82 | fi 83 | 84 | dco::helm::login "$host" "$username" "$password" "$namespace" 85 | ;; 86 | push) 87 | local registry="" 88 | local version="" 89 | local usage 90 | 91 | usage="usage: $(basename "$0") push -r REGISTRY -v VERSION" 92 | while getopts r:v: opt; do 93 | case $opt in 94 | r) 95 | registry=$OPTARG 96 | ;; 97 | v) 98 | version=$OPTARG 99 | ;; 100 | *) 101 | echo "$usage" 102 | exit 1 103 | esac 104 | done 105 | shift $((OPTIND -1)) 106 | 107 | if [[ -z $registry ]] || [[ -z $version ]]; then 108 | echo "$usage" 109 | exit 1 110 | fi 111 | 112 | dco::helm::push "$registry" "$version" 113 | ;; 114 | ""|help) 115 | echo 116 | echo "Usage: $(basename "$0") COMMAND ARGS" 117 | echo 118 | echo "Commands:" 119 | echo " login Authenticate with remote registry" 120 | echo " push Build and upload chart to a remote registry" 121 | echo " help Display usage" 122 | exit 1 123 | esac 124 | } 125 | 126 | if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then 127 | dco::helm::main "${@:-""}" 128 | fi 129 | -------------------------------------------------------------------------------- /test/test.go: -------------------------------------------------------------------------------- 1 | package test 2 | 3 | import ( 4 | "path/filepath" 5 | "runtime" 6 | ) 7 | 8 | // MissingAssetsWarning is a hint as to why an envtest environment will not start. 9 | const MissingAssetsWarning = "Ensure required testing binaries are present by running `make test-assets`" 10 | 11 | // KubebuilderBinaryAssetsDir returns a path where control plane binaries required by envtest should be installed. 12 | // TODO: figure out whether to remove this or update it; it no longer works. 
13 | func KubebuilderBinaryAssetsDir() string { 14 | _, b, _, _ := runtime.Caller(0) 15 | return filepath.Join(filepath.Dir(b), "..", "testbin", "bin") 16 | } 17 | --------------------------------------------------------------------------------
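
The listings above share one pattern: a resource package (ray, spark, istio) builds a typed Kubernetes object, while a thin data-source adapter exposes it to the generic components in pkg/controller/components, which delegate create/update/delete to the actions package. The sketch below shows roughly how that wiring could look for a new cluster type. It is illustrative only: the demo package name, the demoServiceAccountSource type, and the "-demo" naming convention are assumptions; only the components, core, and Kubernetes interfaces come from the code above.

package demo // hypothetical package, not part of this repository

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"

	"github.com/dominodatalab/distributed-compute-operator/pkg/controller/components"
	"github.com/dominodatalab/distributed-compute-operator/pkg/controller/core"
)

// demoServiceAccountSource adapts an arbitrary cluster object to the
// components.ServiceAccountDataSource interface. The naming scheme and the
// hard-coded Delete result are assumptions made for this sketch.
type demoServiceAccountSource struct {
	obj client.Object
}

// ServiceAccount returns the owned object that the generic component reconciles.
func (s demoServiceAccountSource) ServiceAccount() *corev1.ServiceAccount {
	return &corev1.ServiceAccount{
		ObjectMeta: metav1.ObjectMeta{
			Name:      s.obj.GetName() + "-demo",
			Namespace: s.obj.GetNamespace(),
		},
	}
}

// Delete reports whether the resource should be removed instead of created or updated.
func (s demoServiceAccountSource) Delete() bool { return false }

// ServiceAccountComponent wires the adapter into the generic component; the returned
// core.OwnedComponent can then be registered with whatever component list the core
// reconciler iterates over.
func ServiceAccountComponent() core.OwnedComponent {
	return components.ServiceAccount(func(obj client.Object) components.ServiceAccountDataSource {
		return demoServiceAccountSource{obj: obj}
	})
}

This mirrors how the per-framework packages under pkg/cluster (dask, mpi) presumably feed these generic components, in contrast to the older pkg/resources/ray and pkg/resources/spark builders that the controllers consume directly.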