├── config ├── certmanager │ ├── kustomization.yaml │ ├── kustomizeconfig.yaml │ └── certificate.yaml ├── webhook │ ├── kustomization.yaml │ ├── service.yaml │ ├── kustomizeconfig.yaml │ └── manifests_stable.yaml ├── rbac │ ├── service_account.yaml │ ├── auth_proxy_client_clusterrole.yaml │ ├── di_server_service.yaml │ ├── auth_proxy_service.yaml │ ├── auth_proxy_role_binding.yaml │ ├── leader_election_role_binding.yaml │ ├── role_binding.yaml │ ├── auth_proxy_role.yaml │ ├── kustomization.yaml │ ├── leader_election_role.yaml │ └── role.yaml ├── manager │ ├── di_config.yaml │ ├── kustomization.yaml │ ├── di_server.yaml │ ├── di_webhook.yaml │ └── di_operator.yaml ├── crd │ ├── patches │ │ ├── cainjection_in_dijobs.yaml │ │ └── webhook_in_dijobs.yaml │ ├── kustomizeconfig.yaml │ ├── kustomization.yaml │ └── minimal │ │ └── crds.yaml ├── default │ ├── manager_auth_proxy_patch.yaml │ ├── manager_config_patch.yaml │ ├── manager_webhook_patch.yaml │ ├── webhookcainjection_patch.yaml │ └── kustomization.yaml └── samples │ ├── dijob-serial.yaml │ ├── atari-dqn-dist.yaml │ └── atari-dqn-dist-config.yaml ├── docs ├── images │ ├── di-engine-arch.png │ ├── di-engine-schedule.png │ ├── di-engine-status-machine.png │ └── client-go-controller-interaction.jpeg ├── developer-guide.md └── architecture-cn.md ├── .dockerignore ├── chart ├── templates │ ├── config.yaml │ ├── servicemonitor.yaml │ ├── service.yaml │ ├── rbac.yaml │ └── deployment.yaml ├── values.yaml └── Chart.yaml ├── hack ├── update-image-tags.sh ├── boilerplate.go.txt ├── update-version.sh ├── update-codegen.sh └── update_replicas.go ├── test └── e2e │ ├── README.md │ ├── jobs │ ├── normal-job-sleep.yaml │ ├── normal-job.yaml │ ├── name-validate-task-repeat.yaml │ ├── name-validate-without-name.yaml │ ├── name-validate-name-repeat.yaml │ └── name-validate-none-type-task-without-name.yaml │ └── e2e_suite_test.go ├── pkg ├── utils │ ├── testutils │ │ ├── const.go │ │ ├── dijob.go │ │ └── pod.go │ ├── filters.go │ ├── validator.go │ └── util.go ├── server │ ├── utils.go │ ├── types │ │ ├── types.go │ │ └── error.go │ ├── server.go │ ├── handlers.go │ ├── processor.go │ └── suite_test.go ├── allocator │ ├── types │ │ ├── policy.go │ │ ├── fit_policy.go │ │ └── infos.go │ ├── node.go │ ├── job.go │ └── allocator.go ├── context │ ├── context.go │ ├── job.go │ └── node.go ├── common │ ├── event_handler.go │ ├── handler │ │ └── event_handler.go │ ├── config.go │ └── config_test.go ├── api │ └── v2alpha1 │ │ ├── groupversion_info.go │ │ ├── zz_generated.deepcopy.go │ │ └── dijob_types.go └── controllers │ ├── suite_test.go │ ├── handler.go │ ├── dijob_controller.go │ └── dijob_test.go ├── go.mod ├── Dockerfile.dev ├── PROJECT ├── .gitignore ├── e2e ├── README.md └── e2e_suite_test.go ├── main.go ├── Dockerfile ├── cmd ├── common │ └── common.go ├── root.go ├── server │ └── server.go └── operator │ └── operator.go ├── .golangci.yml ├── .github └── workflows │ ├── build.yaml │ └── release.yaml ├── README.md └── Makefile /config/certmanager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - certificate.yaml 3 | 4 | configurations: 5 | - kustomizeconfig.yaml -------------------------------------------------------------------------------- /docs/images/di-engine-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/DI-orchestrator/HEAD/docs/images/di-engine-arch.png 
-------------------------------------------------------------------------------- /docs/images/di-engine-schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/DI-orchestrator/HEAD/docs/images/di-engine-schedule.png -------------------------------------------------------------------------------- /docs/images/di-engine-status-machine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/DI-orchestrator/HEAD/docs/images/di-engine-status-machine.png -------------------------------------------------------------------------------- /config/webhook/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - manifests_stable.yaml 3 | - service.yaml 4 | 5 | configurations: 6 | - kustomizeconfig.yaml 7 | -------------------------------------------------------------------------------- /config/rbac/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | name: di-orchestrator 6 | name: di-orchestrator -------------------------------------------------------------------------------- /docs/images/client-go-controller-interaction.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/DI-orchestrator/HEAD/docs/images/client-go-controller-interaction.jpeg -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file 2 | # Ignore all files which are not go type 3 | !**/*.go 4 | !**/*.mod 5 | !**/*.sum 6 | -------------------------------------------------------------------------------- /config/manager/di_config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: di-config 5 | data: 6 | DI_JOB_DEFAULT_RESOURCES: '{"resources": {"requests": {"cpu": 1, "memory": "2Gi"}}}' 7 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_client_clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: di-metrics-reader 5 | rules: 6 | - nonResourceURLs: ["/metrics"] 7 | verbs: ["get"] 8 | -------------------------------------------------------------------------------- /chart/templates/config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | DI_JOB_DEFAULT_RESOURCES: '{"resources": {"requests": {"cpu": 1, "memory": "2Gi"}}}' 4 | kind: ConfigMap 5 | metadata: 6 | name: di-config 7 | namespace: {{ .Release.Namespace }} 8 | -------------------------------------------------------------------------------- /config/webhook/service.yaml: -------------------------------------------------------------------------------- 1 | 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: di-webhook-service 6 | namespace: di-system 7 | spec: 8 | ports: 9 | - port: 443 10 | targetPort: 9443 11 | selector: 12 | control-plane: di-webhook 13 | 
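The `DI_JOB_DEFAULT_RESOURCES` entry in `config/manager/di_config.yaml` above (and its chart twin in `chart/templates/config.yaml`) is loaded into the operator and server via `envFrom`, and by its name presumably supplies default resource requests for DIJob pods that specify none. A minimal sketch of overriding it, assuming only the key and JSON shape shown above:

```bash
# Hypothetical override: raise the default request for DIJob task pods.
# Assumes di-config lives in di-system (the manifests leave the namespace to kustomize)
# and that only the DI_JOB_DEFAULT_RESOURCES key is read.
kubectl -n di-system patch configmap di-config --type merge \
  -p '{"data":{"DI_JOB_DEFAULT_RESOURCES":"{\"resources\": {\"requests\": {\"cpu\": 2, \"memory\": \"4Gi\"}}}"}}'
```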
-------------------------------------------------------------------------------- /config/rbac/di_server_service.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | labels: 6 | control-plane: di-server 7 | name: di-server 8 | spec: 9 | selector: 10 | control-plane: di-server 11 | ports: 12 | - protocol: TCP 13 | port: 8081 14 | targetPort: 8081 15 | -------------------------------------------------------------------------------- /hack/update-image-tags.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu -o pipefail 3 | 4 | dir=$1 5 | image_tag=$2 6 | 7 | find "$dir" -type f -name '*.yaml' | while read -r f; do 8 | echo "$f" 9 | sed "s|opendilab/di-orchestrator:.*|opendilab/di-orchestrator:${image_tag}|" "$f" >.tmp 10 | mv .tmp "$f" 11 | done 12 | -------------------------------------------------------------------------------- /test/e2e/README.md: -------------------------------------------------------------------------------- 1 | # E2E Tests 2 | The e2e tests exercise the robustness of DI-engine, ensuring that DIJobs can tolerate common exceptions. 3 | 4 | ## Prerequisites 5 | A well-prepared Kubernetes cluster with di-orchestrator installed. 6 | ## Run 7 | ```bash 8 | make test-e2e 9 | ``` 10 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | control-plane: di-operator 6 | name: di-operator-metrics-service 7 | spec: 8 | ports: 9 | - name: https 10 | port: 8443 11 | targetPort: 8443 12 | selector: 13 | control-plane: di-operator 14 | -------------------------------------------------------------------------------- /config/manager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - di_config.yaml 3 | - di_operator.yaml 4 | - di_server.yaml 5 | # - di_webhook.yaml 6 | apiVersion: kustomize.config.k8s.io/v1beta1 7 | kind: Kustomization 8 | images: 9 | - name: opendilab/di-orchestrator 10 | newName: opendilab/di-orchestrator 11 | newTag: v1.1.3 12 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: di-proxy-rolebinding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: di-proxy-role 9 | subjects: 10 | - kind: ServiceAccount 11 | name: di-orchestrator 12 | -------------------------------------------------------------------------------- /pkg/utils/testutils/const.go: -------------------------------------------------------------------------------- 1 | package testutils 2 | 3 | const ( 4 | DefaultAGConfigNamespace = "di-system" 5 | DefaultAGConfigName = "aggregator-config" 6 | 7 | DIJobName = "dijob-example" 8 | DIJobNamespace = "default" 9 | DIJobImage = "alpine:latest" 10 | DefaultSleepDuration = "2s" 11 | ) 12 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion:
rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: di-leader-election-rolebinding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: Role 8 | name: di-leader-election-role 9 | subjects: 10 | - kind: ServiceAccount 11 | name: di-orchestrator 12 | -------------------------------------------------------------------------------- /config/rbac/role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: di-operator-cluster-rolebinding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: di-operator-cluster-role 9 | subjects: 10 | - kind: ServiceAccount 11 | name: di-orchestrator 12 | -------------------------------------------------------------------------------- /config/crd/patches/cainjection_in_dijobs.yaml: -------------------------------------------------------------------------------- 1 | # The following patch adds a directive for certmanager to inject CA into the CRD 2 | apiVersion: apiextensions.k8s.io/v1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | annotations: 6 | cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) 7 | name: dijobs.diengine.opendilab.org 8 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: di-proxy-role 5 | rules: 6 | - apiGroups: ["authentication.k8s.io"] 7 | resources: 8 | - tokenreviews 9 | verbs: ["create"] 10 | - apiGroups: ["authorization.k8s.io"] 11 | resources: 12 | - subjectaccessreviews 13 | verbs: ["create"] 14 | -------------------------------------------------------------------------------- /pkg/server/utils.go: -------------------------------------------------------------------------------- 1 | package server 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | ) 7 | 8 | func parseJobID(jobID string) (namespace, name string, err error) { 9 | items := strings.Split(jobID, ".") 10 | if len(items) != 2 { 11 | return "", "", fmt.Errorf("job id %s must be in namespace.name format", jobID) 12 | } 13 | return items[0], items[1], nil 14 | } 15 | -------------------------------------------------------------------------------- /pkg/server/types/types.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | type DIJobRequest struct { 4 | Replicas int `json:"replicas"` 5 | } 6 | 7 | type Object interface{} 8 | 9 | type Response struct { 10 | Success bool `json:"success"` 11 | Code int `json:"code"` 12 | Message string `json:"message"` 13 | Data Object `json:"data"` 14 | } 15 | 16 | const ( 17 | CodeSuccess = iota 18 | CodeFailed 19 | ) 20 | -------------------------------------------------------------------------------- /pkg/allocator/types/policy.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | type NodeList []string 4 | 5 | // Policy interface defines two functions to handle single job allocation and global jobs optimization. 
6 | type Policy interface { 7 | Allocate(job JobInfo, nodes map[string]*NodeInfo) (NodeList, error) 8 | Optimize(jobs map[string]JobInfo, nodes map[string]*NodeInfo, prevAllocations map[string]NodeList) (map[string]NodeList, error) 9 | } 10 | -------------------------------------------------------------------------------- /chart/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for chart. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 4 | 5 | # tag for di-orchestrator image 6 | tag: v1.1.3 7 | 8 | # registry for di-orchestrator image 9 | registry: opendilab 10 | serviceDomainName: svc.cluster.local 11 | 12 | operatorName: di-operator 13 | serverName: di-server 14 | serverPort: 8081 15 | 16 | qps: 100 17 | 18 | burst: 200 19 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module opendilab.org/di-orchestrator 2 | 3 | go 1.16 4 | 5 | require ( 6 | github.com/gin-gonic/gin v1.7.7 7 | github.com/go-logr/logr v0.4.0 8 | github.com/onsi/ginkgo v1.16.4 9 | github.com/onsi/gomega v1.15.0 10 | github.com/spf13/cobra v1.2.1 11 | github.com/spf13/pflag v1.0.5 12 | k8s.io/api v0.21.5 13 | k8s.io/apimachinery v0.21.5 14 | k8s.io/client-go v0.21.5 15 | k8s.io/kubectl v0.21.5 16 | sigs.k8s.io/controller-runtime v0.9.7 17 | ) 18 | -------------------------------------------------------------------------------- /Dockerfile.dev: -------------------------------------------------------------------------------- 1 | # Build the di-orchestrator binary 2 | 3 | # Package the prebuilt di-orchestrator binary into a minimal Red Hat UBI8 base image 4 | # (see https://github.com/GoogleContainerTools/distroless for a smaller distroless alternative) 5 | FROM redhat/ubi8:latest as di-orchestrator 6 | LABEL maintainer="opendilab.contact@gmail.com" 7 | RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime 8 | WORKDIR / 9 | COPY ./bin/di-orchestrator .
10 | 11 | ENTRYPOINT ["/di-orchestrator"] 12 | 13 | -------------------------------------------------------------------------------- /PROJECT: -------------------------------------------------------------------------------- 1 | domain: opendilab.org 2 | layout: 3 | - go.kubebuilder.io/v3 4 | projectName: di 5 | repo: opendilab.org/di-orchestrator 6 | resources: 7 | - api: 8 | crdVersion: v1 9 | namespaced: true 10 | controller: true 11 | domain: opendilab.org 12 | group: diengine 13 | kind: DIJob 14 | path: opendilab.org/di-orchestrator/api/v2alpha1 15 | version: v2alpha1 16 | webhooks: 17 | defaulting: true 18 | validation: true 19 | webhookVersion: v1 20 | version: "3" 21 | -------------------------------------------------------------------------------- /config/certmanager/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # This configuration is for teaching kustomize how to update name ref and var substitution 2 | nameReference: 3 | - kind: Issuer 4 | group: cert-manager.io 5 | fieldSpecs: 6 | - kind: Certificate 7 | group: cert-manager.io 8 | path: spec/issuerRef/name 9 | 10 | varReference: 11 | - kind: Certificate 12 | group: cert-manager.io 13 | path: spec/commonName 14 | - kind: Certificate 15 | group: cert-manager.io 16 | path: spec/dnsNames -------------------------------------------------------------------------------- /config/rbac/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - role.yaml 3 | - role_binding.yaml 4 | - leader_election_role.yaml 5 | - leader_election_role_binding.yaml 6 | # Comment the following 4 lines if you want to disable 7 | # the auth proxy (https://github.com/brancz/kube-rbac-proxy) 8 | # which protects your /metrics endpoint. 9 | - auth_proxy_service.yaml 10 | - auth_proxy_role.yaml 11 | - auth_proxy_role_binding.yaml 12 | - auth_proxy_client_clusterrole.yaml 13 | - di_server_service.yaml 14 | - service_account.yaml -------------------------------------------------------------------------------- /config/default/manager_auth_proxy_patch.yaml: -------------------------------------------------------------------------------- 1 | # This patch injects a sidecar container that acts as an HTTP proxy for the 2 | # controller manager; it performs RBAC authorization against the Kubernetes API using SubjectAccessReviews. 3 | apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: di-operator 7 | spec: 8 | template: 9 | spec: 10 | containers: 11 | - name: manager 12 | args: 13 | - "--probe-addr=:8080" 14 | - "--metric-addr=:8443" 15 | - "--leader-elect" 16 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions to do leader election.
2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | name: di-leader-election-role 6 | rules: 7 | - apiGroups: 8 | - "" 9 | - coordination.k8s.io 10 | resources: 11 | - configmaps 12 | - leases 13 | verbs: 14 | - get 15 | - list 16 | - watch 17 | - create 18 | - update 19 | - patch 20 | - delete 21 | - apiGroups: 22 | - "" 23 | resources: 24 | - events 25 | verbs: 26 | - create 27 | - patch 28 | -------------------------------------------------------------------------------- /pkg/allocator/types/fit_policy.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | // FitPolicy is an implementation of Policy interface. 4 | type FitPolicy struct{} 5 | 6 | func NewFitPolicy() *FitPolicy { 7 | return &FitPolicy{} 8 | } 9 | 10 | func (p FitPolicy) Allocate(job JobInfo, nodes map[string]*NodeInfo) (NodeList, error) { 11 | return NodeList{}, nil 12 | } 13 | 14 | func (p FitPolicy) Optimize(jobs map[string]JobInfo, nodes map[string]*NodeInfo, prevAllocations map[string]NodeList) (map[string]NodeList, error) { 15 | return map[string]NodeList{}, nil 16 | } 17 | -------------------------------------------------------------------------------- /config/default/manager_config_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: di-operator 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: manager 10 | args: 11 | - "--config=controller_manager_config.yaml" 12 | volumeMounts: 13 | - name: manager-config 14 | mountPath: /controller_manager_config.yaml 15 | subPath: controller_manager_config.yaml 16 | volumes: 17 | - name: manager-config 18 | configMap: 19 | name: manager-config 20 | -------------------------------------------------------------------------------- /config/crd/patches/webhook_in_dijobs.yaml: -------------------------------------------------------------------------------- 1 | # The following patch enables a conversion webhook for the CRD 2 | apiVersion: apiextensions.k8s.io/v1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | name: dijobs.diengine.opendilab.org 6 | spec: 7 | conversion: 8 | strategy: Webhook 9 | webhook: 10 | clientConfig: 11 | service: 12 | namespace: $(SERVICE_NAMESPACE) 13 | name: $(SERVICE_NAME) 14 | path: /mutate-diengine-opendilab-org-v1alpha2-dijob 15 | conversionReviewVersions: 16 | - "v1" 17 | - "v1beta1" 18 | -------------------------------------------------------------------------------- /chart/templates/servicemonitor.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: {{ .Values.operatorName }}-exporter 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | control-plane: {{ .Values.operatorName }} 8 | spec: 9 | endpoints: 10 | - interval: 60s 11 | port: metrics-port 12 | path: /metrics 13 | scheme: http 14 | jobLabel: control-plane 15 | namespaceSelector: 16 | matchNames: 17 | - {{ .Release.Namespace }} 18 | selector: 19 | matchLabels: 20 | control-plane: {{ .Values.operatorName }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Binaries for programs and plugins 3 | *.exe 4 | *.exe~ 5 | *.dll 6 | *.so 7 | *.dylib 8 | bin 9 | 10 | # Test binary, build with `go test -c` 11 | *.test 12 | 13 | # Output of the go 
coverage tool, specifically when used with LiteIDE 14 | *.out 15 | 16 | # Kubernetes Generated files - skip generated files, except for vendored files 17 | 18 | !vendor/**/zz_generated.* 19 | 20 | # editor and IDE paraphernalia 21 | .idea 22 | *.swp 23 | *.swo 24 | *~ 25 | 26 | *.vscode 27 | 28 | *.coverprofile 29 | coverage.out.* 30 | 31 | config/webhook/manifests.yaml 32 | 33 | hello/* 34 | tmp/* 35 | -------------------------------------------------------------------------------- /pkg/context/context.go: -------------------------------------------------------------------------------- 1 | package context 2 | 3 | import ( 4 | "github.com/go-logr/logr" 5 | "k8s.io/client-go/rest" 6 | "k8s.io/client-go/tools/record" 7 | "sigs.k8s.io/controller-runtime/pkg/client" 8 | ) 9 | 10 | type Context struct { 11 | config *rest.Config 12 | Log logr.Logger 13 | client.Client 14 | Recorder record.EventRecorder 15 | } 16 | 17 | func NewContext(config *rest.Config, client client.Client, recorder record.EventRecorder, logger logr.Logger) Context { 18 | return Context{ 19 | config: config, 20 | Client: client, 21 | Recorder: recorder, 22 | Log: logger, 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /pkg/utils/filters.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | ) 6 | 7 | type Filter func(obj interface{}) bool 8 | type Filters []Filter 9 | 10 | func (f Filters) Apply(obj interface{}) bool { 11 | for _, filter := range f { 12 | if !filter(obj) { 13 | return false 14 | } 15 | } 16 | return true 17 | } 18 | 19 | var ( 20 | TerminatingPodFilter = func(obj interface{}) bool { 21 | pod := obj.(*corev1.Pod) 22 | return IsPodTerminating(pod) 23 | } 24 | 25 | NonTerminatingPodFilter = func(obj interface{}) bool { 26 | pod := obj.(*corev1.Pod) 27 | return !IsPodTerminating(pod) 28 | } 29 | ) 30 | -------------------------------------------------------------------------------- /config/default/manager_webhook_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: di-webhook 5 | namespace: di-system 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: webhook 11 | ports: 12 | - containerPort: 9443 13 | name: webhook-server 14 | protocol: TCP 15 | volumeMounts: 16 | - mountPath: /tmp/k8s-webhook-server/serving-certs 17 | name: cert 18 | readOnly: true 19 | volumes: 20 | - name: cert 21 | secret: 22 | defaultMode: 420 23 | secretName: di-webhook-server-cert 24 | -------------------------------------------------------------------------------- /hack/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2021 The OpenDILab authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ -------------------------------------------------------------------------------- /e2e/README.md: -------------------------------------------------------------------------------- 1 | # E2E Tests 2 | The e2e tests exercise the robustness of DI-engine, ensuring that DIJobs can tolerate common exceptions. 3 | 4 | ## Run 5 | ```bash 6 | go test -timeout 4h -cover -v ./e2e --ginkgo.v --shared-volumes-dir /data/nfs/ding --kubeconfig ~/.kube/config 7 | ``` 8 | - `shared-volumes-dir` is the shared-volumes directory through which DI-engine modules (coordinator, collector, etc.) exchange data and models. Each job's shared volumes are placed under this directory. Default `/data/nfs/ding`. 9 | - `kubeconfig` is the path to the kubeconfig file used to access the Kubernetes cluster. Default `$HOME/.kube/config`. 10 | - `timeout` should be set according to how long the tests are expected to run. 11 | -------------------------------------------------------------------------------- /config/default/webhookcainjection_patch.yaml: -------------------------------------------------------------------------------- 1 | # This patch adds annotations to the admission webhook configs; 2 | # the variables $(CERTIFICATE_NAMESPACE) and $(CERTIFICATE_NAME) will be substituted by kustomize. 3 | apiVersion: admissionregistration.k8s.io/v1 4 | kind: MutatingWebhookConfiguration 5 | metadata: 6 | name: di-mutating-webhook-configuration 7 | annotations: 8 | cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) 9 | --- 10 | apiVersion: admissionregistration.k8s.io/v1 11 | kind: ValidatingWebhookConfiguration 12 | metadata: 13 | name: di-validating-webhook-configuration 14 | annotations: 15 | cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) 16 | -------------------------------------------------------------------------------- /config/crd/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # This file is for teaching kustomize how to substitute name and namespace reference in CRD 2 | nameReference: 3 | - kind: Service 4 | version: v1 5 | fieldSpecs: 6 | - kind: CustomResourceDefinition 7 | version: v1 8 | group: apiextensions.k8s.io 9 | path: spec/conversion/webhook/clientConfig/service/name 10 | 11 | namespace: 12 | - kind: CustomResourceDefinition 13 | version: v1 14 | group: apiextensions.k8s.io 15 | path: spec/conversion/webhook/clientConfig/service/namespace 16 | create: false 17 | 18 | varReference: 19 | - path: metadata/annotations 20 | - kind: CustomResourceDefinition 21 | group: apiextensions.k8s.io 22 | path: spec/conversion/webhook/clientConfig/service/name 23 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2021 The OpenDILab authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | package main 17 | 18 | import "opendilab.org/di-orchestrator/cmd" 19 | 20 | func main() { 21 | cmd.Execute() 22 | } 23 | -------------------------------------------------------------------------------- /chart/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | control-plane: {{ .Values.operatorName }} 6 | name: {{ .Values.operatorName }}-metrics-service 7 | namespace: {{ .Release.Namespace }} 8 | spec: 9 | ports: 10 | - name: metrics-port 11 | port: 8443 12 | targetPort: 8443 13 | selector: 14 | control-plane: {{ .Values.operatorName }} 15 | --- 16 | apiVersion: v1 17 | kind: Service 18 | metadata: 19 | labels: 20 | control-plane: {{ .Values.serverName }} 21 | name: {{ .Values.serverName }} 22 | namespace: {{ .Release.Namespace }} 23 | spec: 24 | ports: 25 | - port: {{ .Values.serverPort }} 26 | protocol: TCP 27 | targetPort: {{ .Values.serverPort }} 28 | selector: 29 | control-plane: {{ .Values.serverName }} -------------------------------------------------------------------------------- /pkg/allocator/types/infos.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | apitypes "k8s.io/apimachinery/pkg/types" 6 | ) 7 | 8 | type JobInfo struct { 9 | Key apitypes.NamespacedName 10 | Resources corev1.ResourceRequirements 11 | MinReplicas int 12 | MaxReplicas int 13 | Preemptible bool 14 | } 15 | 16 | func NewJobInfo(key apitypes.NamespacedName, r corev1.ResourceRequirements, minr int, maxr int, preemptible bool) *JobInfo { 17 | return &JobInfo{ 18 | Key: key, 19 | Resources: r, 20 | MinReplicas: minr, 21 | MaxReplicas: maxr, 22 | Preemptible: preemptible, 23 | } 24 | } 25 | 26 | type NodeInfo struct { 27 | Key string 28 | // Resources is the list of the free resources on the node. 
29 | Resources corev1.ResourceList 30 | } 31 | 32 | func NewNodeInfo(key string, r corev1.ResourceList) *NodeInfo { 33 | return &NodeInfo{Key: key, Resources: r} 34 | } 35 | -------------------------------------------------------------------------------- /config/rbac/role.yaml: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | apiVersion: rbac.authorization.k8s.io/v1 4 | kind: ClusterRole 5 | metadata: 6 | creationTimestamp: null 7 | name: di-operator-cluster-role 8 | rules: 9 | - apiGroups: 10 | - "" 11 | resources: 12 | - events 13 | - pods 14 | - services 15 | verbs: 16 | - create 17 | - delete 18 | - get 19 | - list 20 | - patch 21 | - update 22 | - watch 23 | - apiGroups: 24 | - "" 25 | resources: 26 | - namespaces 27 | - nodes 28 | verbs: 29 | - get 30 | - list 31 | - watch 32 | - apiGroups: 33 | - diengine.opendilab.org 34 | resources: 35 | - dijobs 36 | verbs: 37 | - create 38 | - delete 39 | - get 40 | - list 41 | - patch 42 | - update 43 | - watch 44 | - apiGroups: 45 | - diengine.opendilab.org 46 | resources: 47 | - dijobs/finalizers 48 | verbs: 49 | - update 50 | - apiGroups: 51 | - diengine.opendilab.org 52 | resources: 53 | - dijobs/status 54 | verbs: 55 | - get 56 | - patch 57 | - update 58 | -------------------------------------------------------------------------------- /config/crd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # This kustomization.yaml is not intended to be run by itself, 2 | # since it depends on service name and namespace that are out of this kustomize package. 3 | # It should be run by config/default 4 | resources: 5 | - bases/diengine.opendilab.org_dijobs.yaml 6 | #+kubebuilder:scaffold:crdkustomizeresource 7 | 8 | patchesStrategicMerge: 9 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. 10 | # patches here are for enabling the conversion webhook for each CRD 11 | # - patches/webhook_in_dijobs.yaml 12 | #+kubebuilder:scaffold:crdkustomizewebhookpatch 13 | 14 | # [CERTMANAGER] To enable webhook, uncomment all the sections with [CERTMANAGER] prefix. 15 | # patches here are for enabling the CA injection for each CRD 16 | # - patches/cainjection_in_dijobs.yaml 17 | #+kubebuilder:scaffold:crdkustomizecainjectionpatch 18 | 19 | # the following config is for teaching kustomize how to do kustomization for CRDs. 
20 | configurations: 21 | - kustomizeconfig.yaml 22 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build the di-orchestrator binary 2 | FROM golang:1.16 as builder 3 | 4 | WORKDIR /workspace 5 | # Copy the Go Modules manifests 6 | COPY go.mod go.mod 7 | COPY go.sum go.sum 8 | # cache deps before building and copying source so that we don't need to re-download as much 9 | # and so that source changes don't invalidate our downloaded layer 10 | RUN go mod download 11 | 12 | # Copy the go source 13 | COPY cmd/ cmd/ 14 | COPY pkg/ pkg/ 15 | COPY main.go main.go 16 | 17 | # Build orchestrator 18 | RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 GO111MODULE=on go build -a -o di-orchestrator ./main.go 19 | 20 | # Package the di-orchestrator binary into a minimal Red Hat UBI8 base image 21 | # (see https://github.com/GoogleContainerTools/distroless for a smaller distroless alternative) 22 | FROM redhat/ubi8:latest 23 | LABEL maintainer="opendilab.contact@gmail.com" 24 | RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime 25 | WORKDIR / 26 | COPY --from=builder /workspace/di-orchestrator . 27 | 28 | ENTRYPOINT ["/di-orchestrator"] 29 | -------------------------------------------------------------------------------- /chart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: di-orchestrator 3 | description: Kubernetes Custom Resource and Operator for DI-engine jobs 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | version: 1.1.3 18 | 19 | # This is the version number of the application being deployed. This version number should be 20 | # incremented each time you make changes to the application. 21 | appVersion: 0.1.0 22 | -------------------------------------------------------------------------------- /config/certmanager/certificate.yaml: -------------------------------------------------------------------------------- 1 | # The following manifests contain a self-signed issuer CR and a certificate CR. 2 | # More documentation can be found at https://docs.cert-manager.io 3 | # WARNING: Targets CertManager v1.0. Check https://cert-manager.io/docs/installation/upgrading/ for breaking changes.
4 | apiVersion: cert-manager.io/v1 5 | kind: Issuer 6 | metadata: 7 | name: di-selfsigned-issuer 8 | namespace: di-system 9 | spec: 10 | selfSigned: {} 11 | --- 12 | apiVersion: cert-manager.io/v1 13 | kind: Certificate 14 | metadata: 15 | name: di-serving-cert # this name should match the one that appears in kustomizeconfig.yaml 16 | namespace: di-system 17 | spec: 18 | # $(SERVICE_NAME) and $(SERVICE_NAMESPACE) will be substituted by kustomize 19 | dnsNames: 20 | - $(SERVICE_NAME).$(SERVICE_NAMESPACE).svc 21 | - $(SERVICE_NAME).$(SERVICE_NAMESPACE).svc.cluster.local 22 | issuerRef: 23 | kind: Issuer 24 | name: di-selfsigned-issuer 25 | secretName: di-webhook-server-cert # this secret will not be prefixed, since it's not managed by kustomize -------------------------------------------------------------------------------- /config/samples/dijob-serial.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: diengine.opendilab.org/v2alpha1 2 | kind: DIJob 3 | metadata: 4 | name: serial-test 5 | # generateName: serial-test- 6 | spec: 7 | priority: "normal" # job priority; reserved field that may be used by scheduling 8 | backoffLimit: 0 # number of restarts; nil means unlimited restarts; defaults to 3 9 | cleanPodPolicy: "Running" # how worker pods are handled after the job finishes 10 | preemptible: false # whether the job may be preempted; preemption is involved when scheduling changes the job's resources 11 | volumes: 12 | - name: cache-volume 13 | emptyDir: 14 | medium: Memory 15 | sizeLimit: 128Mi 16 | tasks: 17 | - replicas: 1 18 | name: serial 19 | type: none 20 | template: 21 | spec: 22 | containers: 23 | - name: di-container 24 | image: opendilab/ding:nightly 25 | imagePullPolicy: Always 26 | env: 27 | - name: PYTHONUNBUFFERED 28 | value: "1" 29 | command: ["/bin/bash", "-c",] 30 | args: 31 | - | 32 | ding -m serial -c dizoo/classic_control/cartpole/config/cartpole_dqn_config.py -s 0 33 | volumeMounts: 34 | - name: cache-volume 35 | mountPath: /dev/shm 36 | 37 | -------------------------------------------------------------------------------- /hack/update-version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu -o pipefail 3 | 4 | version=$1 5 | app_version=$2 6 | if [[ "$version" =~ ^v ]]; then 7 | chart_version=${version:1} 8 | else 9 | chart_version="$version" 10 | fi 11 | 12 | # update chart version 13 | for f in "chart/Chart.yaml"; do 14 | echo "update chart version to ${chart_version}" 15 | sed -r "s|^(\s*)version:(\s*)(.*)|\1version: ${chart_version}|" "$f" >.tmp 16 | mv .tmp "$f" 17 | done 18 | 19 | # update chart app version 20 | for f in "chart/Chart.yaml"; do 21 | echo "update chart app version to ${app_version}" 22 | sed -r "s|^(\s*)appVersion:(\s*)(.*)|\1appVersion: ${app_version}|" "$f" >.tmp 23 | mv .tmp "$f" 24 | done 25 | 26 | # update chart value tag 27 | for f in "chart/values.yaml"; do 28 | echo "update chart value tag to ${version}" 29 | sed -r "s|^(\s*)tag:(\s*)(.*)|\1tag: ${version}|" "$f" >.tmp 30 | mv .tmp "$f" 31 | done 32 | 33 | for f in ".github/workflows/release.yaml"; do 34 | echo "update github action version to ${version}" 35 | sed -r "s|^(\s*)version:(\s*)(.*)|\1version: ${version}|" "$f" >.tmp 36 | mv .tmp "$f" 37 | done 38 | -------------------------------------------------------------------------------- /config/webhook/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # the following config is for teaching kustomize where to look at when substituting vars. 2 | # It requires kustomize v2.1.0 or newer to work properly.
3 | nameReference: 4 | - kind: Service 5 | version: v1 6 | fieldSpecs: 7 | - kind: MutatingWebhookConfiguration 8 | group: admissionregistration.k8s.io 9 | path: webhooks/clientConfig/service/name 10 | - kind: ValidatingWebhookConfiguration 11 | group: admissionregistration.k8s.io 12 | path: webhooks/clientConfig/service/name 13 | 14 | namespace: 15 | - kind: MutatingWebhookConfiguration 16 | group: admissionregistration.k8s.io 17 | path: webhooks/clientConfig/service/namespace 18 | create: true 19 | - kind: ValidatingWebhookConfiguration 20 | group: admissionregistration.k8s.io 21 | path: webhooks/clientConfig/service/namespace 22 | create: true 23 | 24 | varReference: 25 | - path: metadata/annotations 26 | - kind: MutatingWebhookConfiguration 27 | group: admissionregistration.k8s.io 28 | path: webhooks/clientConfig/service/name 29 | - kind: ValidatingWebhookConfiguration 30 | group: admissionregistration.k8s.io 31 | path: webhooks/clientConfig/service/name 32 | -------------------------------------------------------------------------------- /hack/update-codegen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2017 The Kubernetes Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -o errexit 18 | set -o nounset 19 | set -o pipefail 20 | 21 | SCRIPT_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 
22 | MODULE="opendilab.org/di-orchestrator" 23 | 24 | deepcopy-gen --bounding-dirs "${MODULE}/pkg/common/types" \ 25 | --go-header-file "${SCRIPT_ROOT}/hack/boilerplate.go.txt" \ 26 | --input-dirs "${MODULE}/pkg/common/types" \ 27 | --output-base "${SCRIPT_ROOT}" \ 28 | --output-file-base "zz_generated.deepcopy" 29 | 30 | mv ${MODULE}/pkg/common/types/* ${SCRIPT_ROOT}/pkg/common/types/ 31 | rm -rf $(dirname ${MODULE}) 32 | -------------------------------------------------------------------------------- /pkg/server/server.go: -------------------------------------------------------------------------------- 1 | package server 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/gin-gonic/gin" 7 | 8 | dicontext "opendilab.org/di-orchestrator/pkg/context" 9 | ) 10 | 11 | var ( 12 | apiVersion = "v2alpha1" 13 | ) 14 | 15 | type DIServer struct { 16 | ctx dicontext.Context 17 | p ProcessorInterface 18 | serverBindAddress string 19 | } 20 | 21 | func NewDIServer( 22 | ctx dicontext.Context, 23 | processor ProcessorInterface, 24 | serverBindAddress string) *DIServer { 25 | return &DIServer{ 26 | ctx: ctx, 27 | p: processor, 28 | serverBindAddress: serverBindAddress, 29 | } 30 | } 31 | 32 | func (s *DIServer) Start(ctx context.Context) error { 33 | log := s.ctx.Log.WithName("DIServer") 34 | r := gin.Default() 35 | v2alpha1 := r.Group(apiVersion) 36 | { 37 | v2alpha1.GET("job/:id/replicas", s.getReplicas) 38 | v2alpha1.POST("job/:id/replicas", s.addReplicas) 39 | v2alpha1.DELETE("job/:id/replicas", s.deleteReplicas) 40 | v2alpha1.POST("job/:id/profilings", s.profilings) 41 | } 42 | 43 | log.Info("Start listening on", "port", s.serverBindAddress) 44 | if err := r.Run(s.serverBindAddress); err != nil { 45 | return err 46 | } 47 | return nil 48 | } 49 | -------------------------------------------------------------------------------- /pkg/common/event_handler.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "k8s.io/client-go/util/workqueue" 5 | "sigs.k8s.io/controller-runtime/pkg/client" 6 | "sigs.k8s.io/controller-runtime/pkg/event" 7 | ) 8 | 9 | type EventHandler struct { 10 | OnCreateHandlers []func(obj client.Object) 11 | OnUpdateHandlers []func(old client.Object, new client.Object) 12 | OnDeleteHandlers []func(obj client.Object) 13 | } 14 | 15 | // Create implements EventHandler 16 | func (e *EventHandler) Create(evt event.CreateEvent, q workqueue.RateLimitingInterface) { 17 | for _, handler := range e.OnCreateHandlers { 18 | handler(evt.Object) 19 | } 20 | } 21 | 22 | // Update implements EventHandler 23 | func (e *EventHandler) Update(evt event.UpdateEvent, q workqueue.RateLimitingInterface) { 24 | for _, handler := range e.OnUpdateHandlers { 25 | handler(evt.ObjectOld, evt.ObjectNew) 26 | } 27 | } 28 | 29 | // Delete implements EventHandler 30 | func (e *EventHandler) Delete(evt event.DeleteEvent, q workqueue.RateLimitingInterface) { 31 | for _, handler := range e.OnDeleteHandlers { 32 | handler(evt.Object) 33 | } 34 | } 35 | 36 | // Generic implements EventHandler 37 | func (e *EventHandler) Generic(evt event.GenericEvent, q workqueue.RateLimitingInterface) { 38 | } 39 | -------------------------------------------------------------------------------- /pkg/common/handler/event_handler.go: -------------------------------------------------------------------------------- 1 | package handler 2 | 3 | import ( 4 | "k8s.io/client-go/util/workqueue" 5 | "sigs.k8s.io/controller-runtime/pkg/client" 6 | 
"sigs.k8s.io/controller-runtime/pkg/event" 7 | ) 8 | 9 | type EventHandler struct { 10 | OnCreateHandlers []func(obj client.Object) 11 | OnUpdateHandlers []func(old client.Object, new client.Object) 12 | OnDeleteHandlers []func(obj client.Object) 13 | } 14 | 15 | // Create implements EventHandler 16 | func (e *EventHandler) Create(evt event.CreateEvent, q workqueue.RateLimitingInterface) { 17 | for _, handler := range e.OnCreateHandlers { 18 | handler(evt.Object) 19 | } 20 | } 21 | 22 | // Update implements EventHandler 23 | func (e *EventHandler) Update(evt event.UpdateEvent, q workqueue.RateLimitingInterface) { 24 | for _, handler := range e.OnUpdateHandlers { 25 | handler(evt.ObjectOld, evt.ObjectNew) 26 | } 27 | } 28 | 29 | // Delete implements EventHandler 30 | func (e *EventHandler) Delete(evt event.DeleteEvent, q workqueue.RateLimitingInterface) { 31 | for _, handler := range e.OnDeleteHandlers { 32 | handler(evt.Object) 33 | } 34 | } 35 | 36 | // Generic implements EventHandler 37 | func (e *EventHandler) Generic(evt event.GenericEvent, q workqueue.RateLimitingInterface) { 38 | } 39 | -------------------------------------------------------------------------------- /config/manager/di_server.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: di-server 5 | labels: 6 | control-plane: di-server 7 | spec: 8 | selector: 9 | matchLabels: 10 | control-plane: di-server 11 | replicas: 1 12 | template: 13 | metadata: 14 | labels: 15 | control-plane: di-server 16 | spec: 17 | serviceAccount: di-orchestrator 18 | containers: 19 | - command: 20 | - /di-orchestrator 21 | - server 22 | args: 23 | - "--zap-devel=true" 24 | - "--server-bind-address=:8081" 25 | - "--qps=100" 26 | - "--burst=200" 27 | image: opendilab/di-orchestrator:v1.1.3 28 | imagePullPolicy: Always 29 | name: server 30 | envFrom: 31 | - configMapRef: 32 | name: di-config 33 | securityContext: 34 | allowPrivilegeEscalation: false 35 | livenessProbe: 36 | httpGet: 37 | path: /healthz 38 | port: 8080 39 | initialDelaySeconds: 15 40 | periodSeconds: 20 41 | resources: 42 | limits: 43 | cpu: 100m 44 | memory: 500Mi 45 | requests: 46 | cpu: 100m 47 | memory: 500Mi 48 | terminationGracePeriodSeconds: 10 49 | -------------------------------------------------------------------------------- /config/webhook/manifests_stable.yaml: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | apiVersion: admissionregistration.k8s.io/v1 4 | kind: MutatingWebhookConfiguration 5 | metadata: 6 | name: di-mutating-webhook-configuration 7 | webhooks: 8 | - admissionReviewVersions: 9 | - v1 10 | - v1beta1 11 | clientConfig: 12 | service: 13 | name: di-webhook-service 14 | namespace: di-system 15 | path: /mutate-diengine-opendilab-org-v2alpha1-dijob 16 | failurePolicy: Fail 17 | name: mdijob.kb.io 18 | rules: 19 | - apiGroups: 20 | - diengine.opendilab.org 21 | apiVersions: 22 | - v2alpha1 23 | operations: 24 | - CREATE 25 | - UPDATE 26 | resources: 27 | - dijobs 28 | sideEffects: None 29 | 30 | --- 31 | apiVersion: admissionregistration.k8s.io/v1 32 | kind: ValidatingWebhookConfiguration 33 | metadata: 34 | name: di-validating-webhook-configuration 35 | webhooks: 36 | - admissionReviewVersions: 37 | - v1 38 | - v1beta1 39 | clientConfig: 40 | service: 41 | name: di-webhook-service 42 | namespace: di-system 43 | path: /validate-diengine-opendilab-org-v2alpha1-dijob 44 | failurePolicy: Fail 45 | name: vdijob.kb.io 
46 | rules: 47 | - apiGroups: 48 | - diengine.opendilab.org 49 | apiVersions: 50 | - v2alpha1 51 | operations: 52 | - CREATE 53 | - UPDATE 54 | resources: 55 | - dijobs 56 | sideEffects: None 57 | -------------------------------------------------------------------------------- /config/manager/di_webhook.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: di-webhook 5 | labels: 6 | control-plane: di-webhook 7 | spec: 8 | selector: 9 | matchLabels: 10 | control-plane: di-webhook 11 | replicas: 1 12 | template: 13 | metadata: 14 | labels: 15 | control-plane: di-webhook 16 | spec: 17 | containers: 18 | - command: 19 | - /di-orchestrator 20 | - webhook 21 | args: 22 | - "--probe-addr=:8080" 23 | - "--metric-addr=:8443" 24 | - "--port=9443" 25 | - "--qps=100" 26 | - "--burst=200" 27 | image: opendilab/di-orchestrator:v1.1.3 28 | imagePullPolicy: Always 29 | name: webhook 30 | securityContext: 31 | allowPrivilegeEscalation: false 32 | livenessProbe: 33 | httpGet: 34 | path: /healthz 35 | port: 8080 36 | initialDelaySeconds: 15 37 | periodSeconds: 20 38 | readinessProbe: 39 | httpGet: 40 | path: /readyz 41 | port: 8080 42 | initialDelaySeconds: 5 43 | periodSeconds: 10 44 | resources: 45 | limits: 46 | cpu: 30m 47 | memory: 100Mi 48 | requests: 49 | cpu: 30m 50 | memory: 100Mi 51 | terminationGracePeriodSeconds: 10 52 | -------------------------------------------------------------------------------- /config/crd/minimal/crds.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apiextensions.k8s.io/v1 2 | kind: CustomResourceDefinition 3 | metadata: 4 | annotations: 5 | controller-gen.kubebuilder.io/version: v0.4.1 6 | creationTimestamp: null 7 | name: dijobs.diengine.opendilab.org 8 | spec: 9 | group: diengine.opendilab.org 10 | names: 11 | kind: DIJob 12 | listKind: DIJobList 13 | plural: dijobs 14 | shortNames: 15 | - dijob 16 | singular: dijob 17 | scope: Namespaced 18 | versions: 19 | - additionalPrinterColumns: 20 | - jsonPath: .status.phase 21 | name: Phase 22 | type: string 23 | - jsonPath: .metadata.creationTimestamp 24 | name: Age 25 | type: date 26 | name: v2alpha1 27 | schema: 28 | openAPIV3Schema: 29 | description: DIJob is the Schema for the dijobs API 30 | properties: 31 | apiVersion: 32 | type: string 33 | kind: 34 | type: string 35 | metadata: 36 | type: object 37 | spec: 38 | type: object 39 | x-kubernetes-map-type: atomic 40 | x-kubernetes-preserve-unknown-fields: true 41 | status: 42 | type: object 43 | x-kubernetes-map-type: atomic 44 | x-kubernetes-preserve-unknown-fields: true 45 | type: object 46 | served: true 47 | storage: true 48 | subresources: 49 | status: {} 50 | -------------------------------------------------------------------------------- /pkg/api/v2alpha1/groupversion_info.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2021 The OpenDILab authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Package v2alpha1 contains API Schema definitions for the v2alpha1 API group 18 | //+kubebuilder:object:generate=true 19 | //+groupName=diengine.opendilab.org 20 | package v2alpha1 21 | 22 | import ( 23 | "k8s.io/apimachinery/pkg/runtime/schema" 24 | "sigs.k8s.io/controller-runtime/pkg/scheme" 25 | ) 26 | 27 | var ( 28 | // KindDIJob is kind of DIJob 29 | KindDIJob = "DIJob" 30 | 31 | // GroupVersion is group version used to register these objects 32 | GroupVersion = schema.GroupVersion{Group: "diengine.opendilab.org", Version: "v2alpha1"} 33 | 34 | // SchemeBuilder is used to add go types to the GroupVersionKind scheme 35 | SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} 36 | 37 | // AddToScheme adds the types in this group-version to the given scheme. 38 | AddToScheme = SchemeBuilder.AddToScheme 39 | ) 40 | -------------------------------------------------------------------------------- /test/e2e/jobs/normal-job-sleep.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: diengine.opendilab.org/v2alpha1 2 | kind: DIJob 3 | metadata: 4 | name: normal-job-sleep 5 | spec: 6 | group: "diengine" # group the job belongs to; reserved field 7 | priority: "normal" # job priority; reserved field that may be used by scheduling 8 | backoffLimit: 0 # max allowed restarts; nil means unlimited restarts; defaults to 3; actual restarts appear in status.restarts 9 | cleanPodPolicy: "Running" # how worker pods are handled when the job finishes; Running deletes all pods that are still running 10 | preemptible: false # whether the job may be preempted when scheduling changes its resources; currently only false is supported 11 | tasks: 12 | - replicas: 1 13 | name: "learner" 14 | type: learner 15 | template: 16 | spec: 17 | containers: 18 | - name: di-container 19 | image: alpine:3.9 20 | command: ["/bin/sh", "-c",] 21 | args: 22 | - | 23 | sleep 15 24 | restartPolicy: Never 25 | - replicas: 1 26 | name: "evaluator" 27 | type: evaluator 28 | template: 29 | spec: 30 | containers: 31 | - name: di-container 32 | image: alpine:3.9 33 | command: ["/bin/sh", "-c",] 34 | args: 35 | - | 36 | sleep 15 37 | restartPolicy: Never 38 | - replicas: 2 39 | name: "collector" 40 | type: collector 41 | template: 42 | spec: 43 | containers: 44 | - name: di-container 45 | image: alpine:3.9 46 | command: ["/bin/sh", "-c",] 47 | args: 48 | - | 49 | sleep 15 50 | restartPolicy: Never -------------------------------------------------------------------------------- /pkg/utils/validator.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "fmt" 5 | 6 | div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1" 7 | ) 8 | 9 | type Validator func(job *div2alpha1.DIJob) error 10 | type Validators []Validator 11 | 12 | func (f Validators) Apply(job *div2alpha1.DIJob) error { 13 | for _, filter := range f { 14 | if err := filter(job); err != nil { 15 | return err 16 | } 17 | } 18 | return nil 19 | } 20 | 21 | var ( 22 | TaskTypeNameValidator = func(job *div2alpha1.DIJob) error { 23 | taskTypeNumber := map[div2alpha1.TaskType]int{} // count tasks per type; there may be at most one learner, collector, and evaluator task 24 | taskNames := map[string]int{} // count tasks per name; every task name (learner, collector, evaluator, none) must be unique
25 | for _, task := range job.Spec.Tasks { 26 | taskTypeNumber[task.Type]++ 27 | if taskTypeNumber[task.Type] > 1 && task.Type != div2alpha1.TaskTypeNone { // more than one task of this type 28 | return fmt.Errorf("the number of %s task is more than one", task.Type) 29 | } 30 | if task.Type == div2alpha1.TaskTypeNone && task.Name == "" { // a none-type task must have a name 31 | return fmt.Errorf("none type task has no name") 32 | } 33 | taskNames[task.Name]++ 34 | } 35 | for name, number := range taskNames { // check that every name is unique 36 | if number > 1 { 37 | return fmt.Errorf("there is more than one task named %s", name) 38 | } 39 | } 40 | return nil 41 | } 42 | ) 43 | -------------------------------------------------------------------------------- /config/manager/di_operator.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: di-operator 5 | labels: 6 | control-plane: di-operator 7 | spec: 8 | selector: 9 | matchLabels: 10 | control-plane: di-operator 11 | replicas: 1 12 | template: 13 | metadata: 14 | labels: 15 | control-plane: di-operator 16 | spec: 17 | serviceAccount: di-orchestrator 18 | containers: 19 | - command: 20 | - /di-orchestrator 21 | - operator 22 | args: 23 | - "--zap-devel=true" 24 | - "--probe-addr=:8080" 25 | - "--metric-addr=:8443" 26 | - "--leader-elect" 27 | - "--qps=100" 28 | - "--burst=200" 29 | image: opendilab/di-orchestrator:v1.1.3 30 | imagePullPolicy: Always 31 | name: manager 32 | envFrom: 33 | - configMapRef: 34 | name: di-config 35 | securityContext: 36 | allowPrivilegeEscalation: false 37 | livenessProbe: 38 | httpGet: 39 | path: /healthz 40 | port: 8080 41 | initialDelaySeconds: 15 42 | periodSeconds: 20 43 | readinessProbe: 44 | httpGet: 45 | path: /readyz 46 | port: 8080 47 | initialDelaySeconds: 5 48 | periodSeconds: 10 49 | resources: 50 | limits: 51 | cpu: 100m 52 | memory: 500Mi 53 | requests: 54 | cpu: 100m 55 | memory: 500Mi 56 | terminationGracePeriodSeconds: 10 57 | -------------------------------------------------------------------------------- /test/e2e/jobs/normal-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: diengine.opendilab.org/v2alpha1 2 | kind: DIJob 3 | metadata: 4 | name: normal-job 5 | spec: 6 | group: "diengine" # group the job belongs to; reserved field 7 | priority: "normal" # job priority; reserved field that may be used by scheduling 8 | backoffLimit: 0 # max allowed restarts; nil means unlimited restarts; defaults to 3; actual restarts appear in status.restarts 9 | cleanPodPolicy: "Running" # how worker pods are handled when the job finishes; Running deletes all pods that are still running 10 | preemptible: false # whether the job may be preempted when scheduling changes its resources; currently only false is supported 11 | tasks: 12 | - replicas: 1 13 | name: "learner" 14 | type: learner 15 | template: 16 | spec: 17 | containers: 18 | - name: di-container 19 | image: alpine:3.9 20 | command: ["/bin/sh", "-c",] 21 | args: 22 | - | 23 | echo $DI_NODES 24 | echo $DI_LEARNER_NODES 25 | restartPolicy: Never 26 | - replicas: 1 27 | name: "evaluator" 28 | type: evaluator 29 | template: 30 | spec: 31 | containers: 32 | - name: di-container 33 | image: alpine:3.9 34 | command: ["/bin/sh", "-c",] 35 | args: 36 | - | 37 | echo $DI_NODES 38 | echo $DI_EVALUATOR_NODES 39 | restartPolicy: Never 40 | - replicas: 2 41 | name: "collector" 42 | type: collector 43 | template: 44 | spec: 45 | containers: 46 | - name: di-container 47 | image: alpine:3.9 48 | command: ["/bin/sh", "-c",] 49 | args: 50 | - | 51 | echo $DI_NODES 52 | echo $DI_COLLECTOR_NODES 53 | restartPolicy: Never
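The e2e job manifests above also work as hand-runnable samples. A minimal sketch of submitting one, assuming di-orchestrator is already installed; the `dijob` short name and the `.status.phase` printer column come from config/crd/minimal/crds.yaml:

```bash
# Submit the job, then read its phase (the Phase column of `kubectl get dijob`).
kubectl apply -f test/e2e/jobs/normal-job.yaml
kubectl get dijob normal-job -o jsonpath='{.status.phase}'

# The workers just echo their DI_*_NODES environment and exit; clean up afterwards.
kubectl delete dijob normal-job
```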
-------------------------------------------------------------------------------- /test/e2e/jobs/name-validate-task-repeat.yaml: --------------------------------------------------------------------------------
1 | apiVersion: diengine.opendilab.org/v2alpha1
2 | kind: DIJob
3 | metadata:
4 | name: validate-task-repeat
5 | spec:
6 | group: "diengine" # marks the group the job belongs to; reserved field
7 | priority: "normal" # the job's priority; reserved field that may be used during scheduling
8 | backoffLimit: 0 # maximum number of restarts allowed for the job; nil means unlimited restarts; defaults to 3; the actual restart count is shown in status.restarts
9 | cleanPodPolicy: "Running" # how worker pods are handled after the job finishes; Running means all pods still Running are deleted once the job completes
10 | preemptible: false # whether the job may be preempted; resource changes during scheduling can involve preemption; currently only false is supported
11 | tasks:
12 | - replicas: 1
13 | name: "learner"
14 | type: learner
15 | template:
16 | spec:
17 | containers:
18 | - name: di-container
19 | image: alpine:3.9
20 | command: ["/bin/sh", "-c",]
21 | args:
22 | - |
23 | echo $DI_NODES
24 | echo $DI_LEARNER_NODES
25 | restartPolicy: Never
26 | - replicas: 1
27 | name: "evaluator"
28 | type: collector # collector type, duplicated by the task below
29 | template:
30 | spec:
31 | containers:
32 | - name: di-container
33 | image: alpine:3.9
34 | command: ["/bin/sh", "-c",]
35 | args:
36 | - |
37 | echo $DI_NODES
38 | echo $DI_EVALUATOR_NODES
39 | restartPolicy: Never
40 | - replicas: 2
41 | name: "collector"
42 | type: collector # repeated collector type; validation should reject this job
43 | template:
44 | spec:
45 | containers:
46 | - name: di-container
47 | image: alpine:3.9
48 | command: ["/bin/sh", "-c",]
49 | args:
50 | - |
51 | echo $DI_NODES
52 | echo $DI_COLLECTOR_NODES
53 | restartPolicy: Never
-------------------------------------------------------------------------------- /test/e2e/jobs/name-validate-without-name.yaml: --------------------------------------------------------------------------------
1 | apiVersion: diengine.opendilab.org/v2alpha1
2 | kind: DIJob
3 | metadata:
4 | name: validate-task-without-name
5 | spec:
6 | group: "diengine" # marks the group the job belongs to; reserved field
7 | priority: "normal" # the job's priority; reserved field that may be used during scheduling
8 | backoffLimit: 0 # maximum number of restarts allowed for the job; nil means unlimited restarts; defaults to 3; the actual restart count is shown in status.restarts
9 | cleanPodPolicy: "Running" # how worker pods are handled after the job finishes; Running means all pods still Running are deleted once the job completes
10 | preemptible: false # whether the job may be preempted; resource changes during scheduling can involve preemption; currently only false is supported
11 | tasks:
12 | - replicas: 1
13 | # name: "learner" # learner task deliberately left unnamed
14 | type: learner
15 | template:
16 | spec:
17 | containers:
18 | - name: di-container
19 | image: alpine:3.9
20 | command: ["/bin/sh", "-c",]
21 | args:
22 | - |
23 | echo $DI_NODES
24 | echo $DI_LEARNER_NODES
25 | restartPolicy: Never
26 | - replicas: 1
27 | # name: "evaluator"
28 | type: evaluator
29 | template:
30 | spec:
31 | containers:
32 | - name: di-container
33 | image: alpine:3.9
34 | command: ["/bin/sh", "-c",]
35 | args:
36 | - |
37 | echo $DI_NODES
38 | echo $DI_EVALUATOR_NODES
39 | restartPolicy: Never
40 | - replicas: 2
41 | name: "collector"
42 | type: collector
43 | template:
44 | spec:
45 | containers:
46 | - name: di-container
47 | image: alpine:3.9
48 | command: ["/bin/sh", "-c",]
49 | args:
50 | - |
51 | echo $DI_NODES
52 | echo $DI_COLLECTOR_NODES
53 | restartPolicy: Never
-------------------------------------------------------------------------------- /pkg/context/job.go: --------------------------------------------------------------------------------
1 | package context
2 | 
3 | import (
4 | "context"
5 | "time"
6 | 
7 | corev1 "k8s.io/api/core/v1"
8 | "k8s.io/apimachinery/pkg/util/wait"
9 | "sigs.k8s.io/controller-runtime/pkg/client"
10 | 
11 | div2alpha1
"opendilab.org/di-orchestrator/pkg/api/v2alpha1" 12 | ) 13 | 14 | func (c *Context) CleanUpJob(ctx context.Context, job *div2alpha1.DIJob) error { 15 | err := c.Delete(ctx, job, &client.DeleteOptions{}) 16 | if err != nil { 17 | return err 18 | } 19 | time.Sleep(250 * time.Millisecond) 20 | 21 | pods, err := c.ListJobPods(ctx, job) 22 | if err != nil { 23 | return err 24 | } 25 | for _, pod := range pods { 26 | err = c.Delete(ctx, pod, &client.DeleteOptions{GracePeriodSeconds: func(a int64) *int64 { return &a }(0)}) 27 | if err != nil { 28 | return err 29 | } 30 | } 31 | 32 | svcs, err := c.ListJobServices(ctx, job) 33 | if err != nil { 34 | return err 35 | } 36 | for _, svc := range svcs { 37 | err = c.Delete(ctx, svc, &client.DeleteOptions{}) 38 | if err != nil { 39 | return err 40 | } 41 | } 42 | return nil 43 | } 44 | 45 | func (c *Context) WaitForAllReplicas(ctx context.Context, job *div2alpha1.DIJob, phase corev1.PodPhase) error { 46 | if err := wait.Poll(100*time.Millisecond, 5*time.Minute, func() (bool, error) { 47 | pods, err := c.ListJobPods(ctx, job) 48 | if err != nil { 49 | return false, err 50 | } 51 | // if there are only coordinator, keep waiting 52 | if len(pods) <= 1 { 53 | return false, nil 54 | } 55 | for _, pod := range pods { 56 | if pod.Status.Phase != phase { 57 | return false, nil 58 | } 59 | } 60 | return true, nil 61 | }); err != nil { 62 | return err 63 | } 64 | 65 | return nil 66 | } 67 | -------------------------------------------------------------------------------- /test/e2e/jobs/name-validate-name-repeat.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: diengine.opendilab.org/v2alpha1 2 | kind: DIJob 3 | metadata: 4 | name: validate-name-repeat 5 | spec: 6 | group: "diengine" # 标记job所属的group,保留字段 7 | priority: "normal" # 表示job的优先级,保留字段,调度中或许可以用到 8 | backoffLimit: 0 # 表示job允许的最大重启次数,可以为nil,表示无限重启;默认为3,可以通过status.restarts看到实际重启次数 9 | cleanPodPolicy: "Running" # 表示job运行完成之后,如何处理worker pods。Running表示job完成后删除所有还在Running的pods 10 | preemptible: false # 表示job是否允许被抢占,调度中对job资源改动之后涉及到抢占操作。目前只能设置为false 11 | tasks: 12 | - replicas: 1 13 | # name: "learner" # learner task without name 14 | type: learner 15 | template: 16 | spec: 17 | containers: 18 | - name: di-container 19 | image: alpine:3.9 20 | command: ["/bin/sh", "-c",] 21 | args: 22 | - | 23 | echo $DI_NODES 24 | echo $DI_LEARNER_NODES 25 | restartPolicy: Never 26 | - replicas: 1 27 | name: "name-test" 28 | type: evaluator 29 | template: 30 | spec: 31 | containers: 32 | - name: di-container 33 | image: alpine:3.9 34 | command: ["/bin/sh", "-c",] 35 | args: 36 | - | 37 | echo $DI_NODES 38 | echo $DI_EVALUATOR_NODES 39 | restartPolicy: Never 40 | - replicas: 2 41 | name: "name-test" # same name with evaluator task 42 | type: collector 43 | template: 44 | spec: 45 | containers: 46 | - name: di-container 47 | image: alpine:3.9 48 | command: ["/bin/sh", "-c",] 49 | args: 50 | - | 51 | echo $DI_NODES 52 | echo $DI_COLLECTOR_NODES 53 | restartPolicy: Never -------------------------------------------------------------------------------- /cmd/common/common.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2021 The OpenDILab authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
-------------------------------------------------------------------------------- /cmd/common/common.go: --------------------------------------------------------------------------------
1 | /*
2 | Copyright 2021 The OpenDILab authors.
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | package common
17 | 
18 | import (
19 | goflag "flag"
20 | 
21 | "github.com/spf13/cobra"
22 | "sigs.k8s.io/controller-runtime/pkg/log/zap"
23 | )
24 | 
25 | type GenericFlags struct {
26 | QPS float64
27 | Burst int
28 | ServiceDomainName string
29 | DIServerURL string
30 | ZapOpts *zap.Options
31 | }
32 | 
33 | func NewGenericFlags() *GenericFlags {
34 | return &GenericFlags{
35 | QPS: 5,
36 | Burst: 10,
37 | ServiceDomainName: "svc.cluster.local",
38 | DIServerURL: "http://di-server.di-system.svc.cluster.local:8081",
39 | ZapOpts: &zap.Options{},
40 | }
41 | }
42 | 
43 | func (f *GenericFlags) AddFlags(cmd *cobra.Command) {
44 | goflag.Float64Var(&f.QPS, "qps", f.QPS, "qps for k8s client")
45 | goflag.IntVar(&f.Burst, "burst", f.Burst, "burst for k8s client")
46 | goflag.StringVar(&f.ServiceDomainName, "service-domain-name", f.ServiceDomainName, "k8s service domain name")
47 | goflag.StringVar(&f.DIServerURL, "di-server-url", f.DIServerURL, "url for accessing di server")
48 | f.ZapOpts.BindFlags(goflag.CommandLine)
49 | }
50 | 
-------------------------------------------------------------------------------- /.golangci.yml: --------------------------------------------------------------------------------
1 | run:
2 | # default concurrency is the available CPU count
3 | concurrency: 4
4 | # timeout for analysis, e.g. 30s, 5m, default is 1m
5 | deadline: 10m
6 | # exit code when at least one issue was found, default is 1
7 | issues-exit-code: 1
8 | 
9 | # include test files or not, default is true
10 | tests: true
11 | 
12 | skip-dirs:
13 | - manifests
14 | - third_party # from go-ethereum
15 | - _out
16 | - doc # user tutorial
17 | - deployment
18 | - config # the crd config yaml
19 | - cluster # the logging bash
20 | - vendor # the third library
21 | - api # auto-generated
22 | - pkg/client # auto-generated
23 | - example
24 | - bin
25 | # output configuration options
26 | output:
27 | # colored-line-number|line-number|json|tab|checkstyle|code-climate, default is "colored-line-number"
28 | format: colored-line-number
29 | 
30 | # print lines of code with issue, default is true
31 | print-issued-lines: true
32 | 
33 | # print linter name at the end of issue text, default is true
34 | print-linter-name: true
35 | 
36 | linters:
37 | fast: true
38 | enable:
39 | - gofmt
40 | - goimports
41 | - golint
42 | - deadcode
43 | disable:
44 | - gocyclo
45 | - typecheck
46 | - bodyclose
47 | - gochecknoinits
48 | - gochecknoglobals
49 | - lll
50 | - maligned
51 | - unparam
52 | - unused
53 | - depguard
54 | - dupl
55 | - errcheck
56 | - gas
57 | - goconst
58 | - gocritic
59 | - gosec
60 | - gosimple
61 | - govet
62 | - interfacer
63 | - ineffassign
64 | - megacheck
65 | - misspell
66 | - nakedret
67 | - prealloc
68 | - staticcheck
69 | - structcheck
70 | - stylecheck
71 | - unconvert
72 | - varcheck
-------------------------------------------------------------------------------- /pkg/allocator/node.go: --------------------------------------------------------------------------------
1 | package allocator
2 | 
3 | import (
4 | "context"
5 | 
6 | 
corev1 "k8s.io/api/core/v1" 7 | "sigs.k8s.io/controller-runtime/pkg/client" 8 | 9 | alloctypes "opendilab.org/di-orchestrator/pkg/allocator/types" 10 | ) 11 | 12 | func (a *Allocator) getNodeInfos(ctx context.Context, nodes []*corev1.Node) (map[string]*alloctypes.NodeInfo, error) { 13 | nodeInfos := make(map[string]*alloctypes.NodeInfo) 14 | // fieldSelector, err := fields.ParseSelector("spec.nodeName=" + name + ",status.phase!=" + string(corev1.PodSucceeded) + ",status.phase!=" + string(corev1.PodFailed)) 15 | // fieldSelector := fields.SelectorFromSet(fields.Set{"spec.nodeName": name}) 16 | // pods, err := c.ListPods(&client.ListOptions{FieldSelector: fieldSelector}) 17 | pods, err := a.ctx.ListPods(ctx, &client.ListOptions{}) 18 | if err != nil { 19 | return nil, err 20 | } 21 | 22 | for _, node := range nodes { 23 | nodePods := make([]*corev1.Pod, 0) 24 | for _, pod := range pods { 25 | if pod.Spec.NodeName == node.Name { 26 | nodePods = append(nodePods, pod) 27 | } 28 | } 29 | nodeInfo, err := a.getNodeInfo(node, nodePods) 30 | if err != nil { 31 | return nil, err 32 | } 33 | nodeInfos[node.Name] = nodeInfo 34 | } 35 | 36 | return nodeInfos, nil 37 | } 38 | 39 | func (a *Allocator) getNodeInfo(node *corev1.Node, pods []*corev1.Pod) (*alloctypes.NodeInfo, error) { 40 | reqs, _, err := a.ctx.GetNodeAllocatedResources(node, pods) 41 | if err != nil { 42 | return nil, err 43 | } 44 | 45 | allocatable := node.Status.Allocatable 46 | free := corev1.ResourceList{} 47 | for resourceName, value := range allocatable { 48 | alloc := value.DeepCopy() 49 | alloc.Sub(reqs[resourceName]) 50 | free[resourceName] = alloc 51 | } 52 | return &alloctypes.NodeInfo{ 53 | Key: node.Name, 54 | Resources: free, 55 | }, nil 56 | } 57 | -------------------------------------------------------------------------------- /pkg/server/types/error.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | import ( 4 | "errors" 5 | ) 6 | 7 | type DIError struct { 8 | Type ErrorType `json:"type"` 9 | Message error `json:"message"` 10 | } 11 | 12 | func (n *DIError) Error() string { 13 | return n.Message.Error() 14 | } 15 | 16 | type ErrorType string 17 | 18 | const ( 19 | // StatusCode = 500 20 | ErrorUnknown ErrorType = "Unknown" 21 | 22 | // StatusCode = 404 23 | ErrorNotFound ErrorType = "NotFound" 24 | 25 | // StatusCode = 409 26 | ErrorAlreadyExists ErrorType = "AlreadyExists" 27 | 28 | // StatusCode = 400 29 | ErrorBadRequest ErrorType = "BadRequest" 30 | 31 | // StatusCode = 501 32 | ErrorNotImplemented ErrorType = "NotImplemented" 33 | ) 34 | 35 | func NewNotFoundError(msg error) *DIError { 36 | return &DIError{ 37 | Type: ErrorNotFound, 38 | Message: msg, 39 | } 40 | } 41 | 42 | func NewAlreadyExistsError(msg error) *DIError { 43 | return &DIError{ 44 | Type: ErrorAlreadyExists, 45 | Message: msg, 46 | } 47 | } 48 | 49 | func NewBadRequestError(msg error) *DIError { 50 | return &DIError{ 51 | Type: ErrorBadRequest, 52 | Message: msg, 53 | } 54 | } 55 | 56 | func NewNotImplementedError(msg error) *DIError { 57 | return &DIError{ 58 | Type: ErrorNotImplemented, 59 | Message: msg, 60 | } 61 | } 62 | 63 | func IsNotFound(err error) bool { 64 | return TypeForError(err) == ErrorNotFound 65 | } 66 | 67 | func IsAlreadyExists(err error) bool { 68 | return TypeForError(err) == ErrorAlreadyExists 69 | } 70 | 71 | func IsBadRequest(err error) bool { 72 | return TypeForError(err) == ErrorBadRequest 73 | } 74 | 75 | func IsNotImplemented(err error) bool { 76 | return 
TypeForError(err) == ErrorNotImplemented
77 | }
78 | 
79 | func TypeForError(err error) ErrorType {
80 | var diErr *DIError
81 | if errors.As(err, &diErr) {
82 | return diErr.Type
83 | }
84 | return ErrorUnknown
85 | }
86 | 
-------------------------------------------------------------------------------- /.github/workflows/build.yaml: --------------------------------------------------------------------------------
1 | name: Build
2 | 
3 | on:
4 | - push
5 | - pull_request
6 | 
7 | jobs:
8 | 
9 | lint:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v2
13 | - name: Set up Go
14 | uses: actions/setup-go@v2
15 | with:
16 | go-version: 1.16
17 | - name: lint
18 | shell: bash
19 | run: |
20 | # install golangci-lint into $(go env GOPATH)/bin
21 | curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.41.1
22 | golangci-lint --version
23 | 
24 | make lint
25 | 
26 | unit-test:
27 | needs: lint
28 | runs-on: ubuntu-latest
29 | steps:
30 | - uses: actions/checkout@v2
31 | - name: Set up Go
32 | uses: actions/setup-go@v2
33 | with:
34 | go-version: 1.16
35 | - name: Test
36 | shell: bash
37 | run: |
38 | # download kubebuilder assets (etcd, kube-apiserver) to bootstrap the test environment
39 | curl -L https://github.com/kubernetes-sigs/kubebuilder/releases/download/v2.3.2/kubebuilder_2.3.2_linux_amd64.tar.gz | tar -xz -C /tmp/
40 | mv /tmp/kubebuilder_2.3.2_linux_amd64 /tmp/kubebuilder
41 | export KUBEBUILDER_ASSETS=/tmp/kubebuilder/bin
42 | make test
43 | 
44 | build:
45 | needs: unit-test
46 | runs-on: ubuntu-latest
47 | steps:
48 | - uses: actions/checkout@v2
49 | - name: Set up Go
50 | uses: actions/setup-go@v2
51 | with:
52 | go-version: 1.16
53 | - name: Build Bin
54 | run: make build
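55 | # Local equivalent of the unit-test step (a sketch; any directory works for the
56 | # envtest binaries, /tmp/kubebuilder/bin simply mirrors the step above):
57 | #   export KUBEBUILDER_ASSETS=/tmp/kubebuilder/bin && make test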
-------------------------------------------------------------------------------- /hack/update_replicas.go: --------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 | "context"
5 | "flag"
6 | "log"
7 | 
8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
9 | "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
10 | "k8s.io/apimachinery/pkg/runtime/schema"
11 | "k8s.io/client-go/dynamic"
12 | ctrl "sigs.k8s.io/controller-runtime"
13 | 
14 | "opendilab.org/di-orchestrator/pkg/api/v2alpha1"
15 | )
16 | 
17 | var (
18 | namespace string
19 | jobname string
20 | replicas int
21 | )
22 | 
23 | func main() {
24 | flag.StringVar(&namespace, "ns", "default", "The namespace of the scaling job.")
25 | flag.StringVar(&jobname, "n", "gobigger-test", "The name of the scaling job.")
26 | flag.IntVar(&replicas, "r", 1, "The number of replicas for the job.")
27 | flag.Parse()
28 | cfg, err := ctrl.GetConfig()
29 | if err != nil {
30 | log.Fatalf("Failed to get kubeconfig: %v", err)
31 | }
32 | 
33 | // create dynamic client for dijob
34 | dclient := dynamic.NewForConfigOrDie(cfg)
35 | gvr := schema.GroupVersionResource{
36 | Group: v2alpha1.GroupVersion.Group,
37 | Version: v2alpha1.GroupVersion.Version,
38 | Resource: "dijobs",
39 | }
40 | diclient := dclient.Resource(gvr)
41 | 
42 | unjob, err := diclient.Namespace(namespace).Get(context.Background(), jobname, metav1.GetOptions{})
43 | if err != nil {
44 | log.Fatalf("Failed to get job with dynamic client: %v", err)
45 | }
46 | // set job.status.replicas to what we want
47 | err = unstructured.SetNestedField(unjob.Object, int64(replicas), "status", "replicas")
48 | if err != nil {
49 | log.Fatalf("Failed to set nested field: %v", err)
50 | }
51 | // update job status
52 | _, err = diclient.Namespace(namespace).UpdateStatus(context.Background(), unjob, metav1.UpdateOptions{})
53 | if err != nil {
54 | log.Fatalf("Failed to update status: %v", err)
55 | }
56 | log.Printf("Successfully updated dijob %s/%s replicas to %d", namespace, jobname, replicas)
57 | }
58 | 
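59 | // Illustrative usage (a sketch; assumes a reachable cluster in the local
60 | // kubeconfig and an existing DIJob named gobigger-test in namespace di-system):
61 | //
62 | //	go run hack/update_replicas.go -ns di-system -n gobigger-test -r 4
63 | //
64 | // This writes job.status.replicas directly via UpdateStatus, so the operator
65 | // is expected to reconcile the job's pods toward the new count.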
-------------------------------------------------------------------------------- /test/e2e/jobs/name-validate-none-type-task-without-name.yaml: --------------------------------------------------------------------------------
1 | apiVersion: diengine.opendilab.org/v2alpha1
2 | kind: DIJob
3 | metadata:
4 | name: validate-none-type-task-without-name
5 | spec:
6 | group: "diengine" # marks the group the job belongs to; reserved field
7 | priority: "normal" # the job's priority; reserved field that may be used during scheduling
8 | backoffLimit: 0 # maximum number of restarts allowed for the job; nil means unlimited restarts; defaults to 3; the actual restart count is shown in status.restarts
9 | cleanPodPolicy: "Running" # how worker pods are handled after the job finishes; Running means all pods still Running are deleted once the job completes
10 | preemptible: false # whether the job may be preempted; resource changes during scheduling can involve preemption; currently only false is supported
11 | tasks:
12 | - replicas: 1
13 | name: "learner"
14 | type: learner
15 | template:
16 | spec:
17 | containers:
18 | - name: di-container
19 | image: alpine:3.9
20 | command: ["/bin/sh", "-c",]
21 | args:
22 | - |
23 | echo $DI_NODES
24 | echo $DI_LEARNER_NODES
25 | restartPolicy: Never
26 | - replicas: 1
27 | name: "evaluator"
28 | type: evaluator
29 | template:
30 | spec:
31 | containers:
32 | - name: di-container
33 | image: alpine:3.9
34 | command: ["/bin/sh", "-c",]
35 | args:
36 | - |
37 | echo $DI_NODES
38 | echo $DI_EVALUATOR_NODES
39 | restartPolicy: Never
40 | - replicas: 2
41 | name: "collector"
42 | type: collector
43 | template:
44 | spec:
45 | containers:
46 | - name: di-container
47 | image: alpine:3.9
48 | command: ["/bin/sh", "-c",]
49 | args:
50 | - |
51 | echo $DI_NODES
52 | echo $DI_COLLECTOR_NODES
53 | restartPolicy: Never
54 | - replicas: 1
55 | # name: "none"
56 | type: none # a none-type task must have a name; this job should be rejected
57 | template:
58 | spec:
59 | containers:
60 | - name: di-container
61 | image: alpine:3.9
62 | command: ["/bin/sh", "-c",]
63 | args:
64 | - |
65 | echo $DI_NODES
66 | echo $DI_COLLECTOR_NODES
67 | restartPolicy: Never
-------------------------------------------------------------------------------- /cmd/root.go: --------------------------------------------------------------------------------
1 | /*
2 | Copyright 2021 The OpenDILab authors.
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | package cmd
17 | 
18 | import (
19 | "flag"
20 | 
21 | "github.com/spf13/cobra"
22 | "github.com/spf13/pflag"
23 | 
24 | "opendilab.org/di-orchestrator/cmd/common"
25 | "opendilab.org/di-orchestrator/cmd/operator"
26 | "opendilab.org/di-orchestrator/cmd/server"
27 | )
28 | 
29 | // rootCmd represents the base command when called without any subcommands
30 | var (
31 | rootCmd = &cobra.Command{
32 | Use: "di-orchestrator",
33 | Short: "A component responsible for managing DI-engine jobs.",
34 | Long: `DI Orchestrator is a component responsible for managing DI-engine jobs in Kubernetes cluster.
35 | This application allows you to run di-operator, di-server and di-webhook locally from the command line.`,
36 | // The root command has no action of its own; running it without a
37 | // subcommand simply prints the help message:
38 | Run: func(cmd *cobra.Command, args []string) {
39 | cmd.Help()
40 | },
41 | }
42 | )
43 | 
44 | // Execute adds all child commands to the root command and sets flags appropriately.
45 | // This is called by main.main(). It only needs to happen once to the rootCmd.
46 | func Execute() {
47 | cobra.CheckErr(rootCmd.Execute())
48 | }
49 | 
50 | func init() {
51 | genFlags := common.NewGenericFlags()
52 | genFlags.AddFlags(rootCmd)
53 | rootCmd.AddCommand(server.NewCmdServer(genFlags))
54 | rootCmd.AddCommand(operator.NewCmdOperator(genFlags))
55 | // add all the flags in go flagset into pflagset
56 | pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
57 | }
58 | 
-------------------------------------------------------------------------------- /pkg/allocator/job.go: --------------------------------------------------------------------------------
1 | package allocator
2 | 
3 | import (
4 | corev1 "k8s.io/api/core/v1"
5 | "k8s.io/apimachinery/pkg/types"
6 | ditypes "opendilab.org/di-orchestrator/pkg/allocator/types"
7 | div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1"
8 | "opendilab.org/di-orchestrator/pkg/common"
9 | diutil "opendilab.org/di-orchestrator/pkg/utils"
10 | )
11 | 
12 | func getJobInfo(job *div2alpha1.DIJob) (ditypes.JobInfo, error) {
13 | res, err := getJobResources(job)
14 | if err != nil {
15 | return ditypes.JobInfo{}, err
16 | }
17 | // TODO(liqingping): redefine the job info.
18 | jobinfo := ditypes.NewJobInfo(
19 | types.NamespacedName{
20 | Namespace: job.Namespace, Name: job.Name,
21 | },
22 | res, 1, 1,
23 | job.Spec.Preemptible,
24 | )
25 | return *jobinfo, nil
26 | }
27 | 
28 | func getJobResources(job *div2alpha1.DIJob) (corev1.ResourceRequirements, error) {
29 | res, err := common.GetDIJobDefaultResources()
30 | if err != nil {
31 | return corev1.ResourceRequirements{}, err
32 | }
33 | for index := 0; index < len(job.Spec.Tasks); index++ {
34 | jobres := diutil.GetPodResources(&job.Spec.Tasks[index].Template.Spec)
35 | if jobres.Requests != nil {
36 | if jobres.Requests.Cpu() != nil {
37 | res.Requests[corev1.ResourceCPU] = *jobres.Requests.Cpu()
38 | }
39 | if jobres.Requests.Memory() != nil {
40 | res.Requests[corev1.ResourceMemory] = *jobres.Requests.Memory()
41 | }
42 | res.Requests[corev1.ResourceName(common.ResourceGPU)] = jobres.Requests[corev1.ResourceName(common.ResourceGPU)]
43 | } else if jobres.Limits != nil {
44 | if jobres.Limits.Cpu() != nil {
45 | res.Limits[corev1.ResourceCPU] = *jobres.Limits.Cpu()
46 | }
47 | if jobres.Limits.Memory() != nil {
48 | res.Limits[corev1.ResourceMemory] = *jobres.Limits.Memory()
49 | }
50 | res.Limits[corev1.ResourceName(common.ResourceGPU)] = jobres.Limits[corev1.ResourceName(common.ResourceGPU)]
51 | }
52 | }
53 | 
54 | if _, ok := res.Requests[corev1.ResourceName(common.ResourceGPU)]; !ok {
55 | res.Requests[corev1.ResourceName(common.ResourceGPU)] = res.Limits[corev1.ResourceName(common.ResourceGPU)]
56 | }
57 | return res, nil
58 | }
59 | 
-------------------------------------------------------------------------------- /test/e2e/e2e_suite_test.go: --------------------------------------------------------------------------------
1 | package e2e
2 | 
3 | import (
4 | "context"
5 | "flag"
6 | "testing"
7 | 
8 | . "github.com/onsi/ginkgo"
9 | . "github.com/onsi/gomega"
10 | corev1 "k8s.io/api/core/v1"
11 | "k8s.io/apimachinery/pkg/runtime"
12 | "k8s.io/client-go/kubernetes"
13 | "k8s.io/client-go/tools/record"
14 | ctrl "sigs.k8s.io/controller-runtime"
15 | "sigs.k8s.io/controller-runtime/pkg/client"
16 | "sigs.k8s.io/controller-runtime/pkg/envtest/printer"
17 | "sigs.k8s.io/controller-runtime/pkg/log/zap"
18 | 
19 | div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1"
20 | dicontext "opendilab.org/di-orchestrator/pkg/context"
21 | )
22 | 
23 | var (
24 | dictx dicontext.Context
25 | clientset *kubernetes.Clientset
26 | 
27 | exampleJobsDir string
28 | serviceDomainName string
29 | )
30 | 
31 | func TestE2E(t *testing.T) {
32 | RegisterFailHandler(Fail)
33 | 
34 | RunSpecsWithDefaultAndCustomReporters(t,
35 | "E2E Suite",
36 | []Reporter{printer.NewlineReporter{}})
37 | }
38 | 
39 | func init() {
40 | testing.Init()
41 | flag.StringVar(&exampleJobsDir, "example-jobs-dir", "./jobs", "dir to the example jobs")
42 | flag.StringVar(&serviceDomainName, "service-domain-name", "svc.cluster.local", "k8s domain name")
43 | flag.Parse()
44 | }
45 | 
46 | var _ = BeforeSuite(func() {
47 | scheme := runtime.NewScheme()
48 | div2alpha1.AddToScheme(scheme)
49 | corev1.AddToScheme(scheme)
50 | 
51 | cfg := ctrl.GetConfigOrDie()
52 | logger := zap.New(zap.UseFlagOptions(&zap.Options{Development: true}))
53 | ctrl.SetLogger(logger)
54 | clients, err := client.New(cfg, client.Options{Scheme: scheme})
55 | Expect(err).NotTo(HaveOccurred())
56 | recorder := record.NewBroadcaster().NewRecorder(clients.Scheme(), corev1.EventSource{Component: "di-orchestrator", Host: "localhost"})
57 | dictx = dicontext.NewContext(cfg, clients, recorder, logger)
58 | 
59 | clientset, err = kubernetes.NewForConfig(cfg)
60 | Expect(err).NotTo(HaveOccurred())
61 | 
62 | dictx.DeleteAllOf(context.Background(), &div2alpha1.DIJob{},
63 | client.InNamespace(namespace), client.MatchingLabels{"stability-test": "dijobs"})
64 | })
65 | 
66 | var _ = AfterSuite(func() {
67 | dictx.DeleteAllOf(context.Background(), &div2alpha1.DIJob{},
68 | client.InNamespace(namespace), client.MatchingLabels{"stability-test": "dijobs"})
69 | })
70 | 
-------------------------------------------------------------------------------- /pkg/server/handlers.go: --------------------------------------------------------------------------------
1 | package server
2 | 
3 | import (
4 | "net/http"
5 | 
6 | "github.com/gin-gonic/gin"
7 | k8serrors "k8s.io/apimachinery/pkg/api/errors"
8 | 
9 | servertypes "opendilab.org/di-orchestrator/pkg/server/types"
10 | )
11 | 
12 | // get replicas api
13 | func (s *DIServer) getReplicas(c *gin.Context) {
14 | reps, err := s.p.GetReplicas(c)
15 | data, statusCode := s.buildResponse(reps, "successfully get replicas", err)
16 | c.JSON(statusCode, data)
17 | }
18 | 
19 | // add replicas api
20 | func (s *DIServer) addReplicas(c *gin.Context) {
21 | reps, err := s.p.AddReplicas(c)
22 | data, statusCode := s.buildResponse(reps, "successfully add replicas", err)
23 | 
c.JSON(statusCode, data)
24 | }
25 | 
26 | // delete replicas api
27 | func (s *DIServer) deleteReplicas(c *gin.Context) {
28 | reps, err := s.p.DeleteReplicas(c)
29 | data, statusCode := s.buildResponse(reps, "successfully delete replicas", err)
30 | c.JSON(statusCode, data)
31 | }
32 | 
33 | // post profilings api
34 | func (s *DIServer) profilings(c *gin.Context) {
35 | _, err := s.p.PostProfilings(c)
36 | data, statusCode := s.buildResponse(nil, "successfully report profilings", err)
37 | c.JSON(statusCode, data)
38 | }
39 | 
40 | func (s *DIServer) buildResponse(reps servertypes.Object, msg string, err error) (servertypes.Response, int) {
41 | log := s.ctx.Log.WithName("DIServer")
42 | 
43 | success := true
44 | code := servertypes.CodeSuccess
45 | statusCode := http.StatusOK
46 | if err != nil {
47 | success = false
48 | code = servertypes.CodeFailed
49 | msg = err.Error()
50 | 
51 | // define status code
52 | if servertypes.IsNotFound(err) || k8serrors.IsNotFound(err) {
53 | statusCode = http.StatusNotFound
54 | } else if servertypes.IsAlreadyExists(err) || k8serrors.IsAlreadyExists(err) {
55 | statusCode = http.StatusConflict
56 | } else if servertypes.IsBadRequest(err) || k8serrors.IsBadRequest(err) {
57 | statusCode = http.StatusBadRequest
58 | } else if servertypes.IsNotImplemented(err) {
59 | statusCode = http.StatusNotImplemented
60 | } else {
61 | statusCode = http.StatusInternalServerError
62 | }
63 | log.Error(err, "failed to process request")
64 | }
65 | 
66 | // build response
67 | rep := servertypes.Response{
68 | Success: success,
69 | Code: code,
70 | Message: msg,
71 | Data: reps,
72 | }
73 | return rep, statusCode
74 | }
75 | 
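76 | // Illustrative call (a sketch; the concrete route paths are registered in
77 | // server.go, which is not shown here, so the /v2alpha1/replicas path below is
78 | // an assumption for illustration only):
79 | //
80 | //	curl http://di-server.di-system.svc.cluster.local:8081/v2alpha1/replicas
81 | //
82 | // Success and failure share one envelope built by buildResponse above, with
83 | // success/code/message/data fields and an HTTP status derived from the error type.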
-------------------------------------------------------------------------------- /pkg/utils/testutils/dijob.go: --------------------------------------------------------------------------------
1 | package testutils
2 | 
3 | import (
4 | corev1 "k8s.io/api/core/v1"
5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
6 | 
7 | div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1"
8 | dicommon "opendilab.org/di-orchestrator/pkg/common"
9 | )
10 | 
11 | func NewDIJob() *div2alpha1.DIJob {
12 | return &div2alpha1.DIJob{
13 | TypeMeta: metav1.TypeMeta{
14 | Kind: div2alpha1.KindDIJob,
15 | APIVersion: div2alpha1.GroupVersion.String(),
16 | },
17 | ObjectMeta: metav1.ObjectMeta{
18 | Name: DIJobName,
19 | Namespace: DIJobNamespace,
20 | },
21 | Spec: div2alpha1.DIJobSpec{
22 | Preemptible: false,
23 | Tasks: []div2alpha1.Task{
24 | {
25 | Name: "task1",
26 | Type: "learner",
27 | Replicas: 1,
28 | Template: corev1.PodTemplateSpec{
29 | Spec: corev1.PodSpec{
30 | Containers: []corev1.Container{
31 | {
32 | Name: dicommon.DefaultContainerName,
33 | Image: DIJobImage,
34 | Command: []string{"/bin/sh", "-c", "sleep " + DefaultSleepDuration},
35 | },
36 | },
37 | },
38 | },
39 | },
40 | {
41 | Name: "task2",
42 | Type: "evaluator",
43 | Replicas: 1,
44 | Template: corev1.PodTemplateSpec{
45 | Spec: corev1.PodSpec{
46 | Containers: []corev1.Container{
47 | {
48 | Name: dicommon.DefaultContainerName,
49 | Image: DIJobImage,
50 | Command: []string{"/bin/sh", "-c", "sleep " + DefaultSleepDuration},
51 | },
52 | },
53 | },
54 | },
55 | },
56 | {
57 | Name: "task3",
58 | Type: "collector",
59 | Replicas: 2,
60 | Template: corev1.PodTemplateSpec{
61 | Spec: corev1.PodSpec{
62 | Containers: []corev1.Container{
63 | {
64 | Name: dicommon.DefaultContainerName,
65 | Image: DIJobImage,
66 | Command: []string{"/bin/sh", "-c", "sleep " + DefaultSleepDuration},
67 | },
68 | },
69 | },
70 | },
71 | },
72 | },
73 | },
74 | }
75 | }
76 | 
77 | func NewNamespace(namespace string) *corev1.Namespace {
78 | return &corev1.Namespace{
79 | TypeMeta: metav1.TypeMeta{
80 | APIVersion: "v1",
81 | Kind: "Namespace",
82 | },
83 | ObjectMeta: metav1.ObjectMeta{
84 | Name: namespace,
85 | },
86 | }
87 | }
88 | 
-------------------------------------------------------------------------------- /pkg/context/node.go: --------------------------------------------------------------------------------
1 | package context
2 | 
3 | import (
4 | "context"
5 | 
6 | corev1 "k8s.io/api/core/v1"
7 | resourcehelper "k8s.io/kubectl/pkg/util/resource"
8 | "sigs.k8s.io/controller-runtime/pkg/client"
9 | )
10 | 
11 | func (c *Context) ListNodes(ctx context.Context) ([]*corev1.Node, error) {
12 | nodeList := corev1.NodeList{}
13 | if err := c.Client.List(ctx, &nodeList, &client.ListOptions{}); err != nil {
14 | return nil, err
15 | }
16 | nodes := []*corev1.Node{}
17 | for _, node := range nodeList.Items {
18 | nodes = append(nodes, node.DeepCopy())
19 | }
20 | return nodes, nil
21 | }
22 | 
23 | func (c *Context) GetNodeAllocatedResources(node *corev1.Node, pods []*corev1.Pod) (reqs corev1.ResourceList, limits corev1.ResourceList, err error) {
24 | nonTerminatedPods := filterOutIneffectivePods(pods)
25 | reqs, limits = getPodsTotalRequestsAndLimits(nonTerminatedPods)
26 | return
27 | }
28 | 
29 | func filterOutIneffectivePods(pods []*corev1.Pod) []*corev1.Pod {
30 | effectivePods := make([]*corev1.Pod, 0)
31 | for _, pod := range pods {
32 | if pod.Status.Phase == corev1.PodUnknown || pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
33 | continue
34 | } else if pod.Status.Phase == corev1.PodPending {
35 | for _, condition := range pod.Status.Conditions {
36 | if condition.Type == corev1.PodScheduled && condition.Status == corev1.ConditionTrue {
37 | effectivePods = append(effectivePods, pod)
38 | }
39 | }
40 | } else if pod.Status.Phase == corev1.PodRunning {
41 | effectivePods = append(effectivePods, pod)
42 | }
43 | }
44 | return effectivePods
45 | }
46 | 
47 | func getPodsTotalRequestsAndLimits(podList []*corev1.Pod) (reqs corev1.ResourceList, limits corev1.ResourceList) {
48 | reqs, limits = corev1.ResourceList{}, corev1.ResourceList{}
49 | for _, pod := range podList {
50 | podReqs, podLimits := resourcehelper.PodRequestsAndLimits(pod)
51 | for podReqName, podReqValue := range podReqs {
52 | if value, ok := reqs[podReqName]; !ok {
53 | reqs[podReqName] = podReqValue.DeepCopy()
54 | } else {
55 | value.Add(podReqValue)
56 | reqs[podReqName] = value
57 | }
58 | }
59 | for podLimitName, podLimitValue := range podLimits {
60 | if value, ok := limits[podLimitName]; !ok {
61 | limits[podLimitName] = podLimitValue.DeepCopy()
62 | } else {
63 | value.Add(podLimitValue)
64 | limits[podLimitName] = value
65 | }
66 | }
67 | }
68 | return
69 | }
70 | 
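71 | // Worked example (a sketch): with two effective pods requesting cpu=500m and
72 | // cpu=1 respectively, getPodsTotalRequestsAndLimits returns reqs[cpu]=1500m;
73 | // Succeeded/Failed/Unknown pods and Pending pods that are not yet scheduled
74 | // are dropped beforehand by filterOutIneffectivePods.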
-------------------------------------------------------------------------------- /pkg/utils/testutils/pod.go: --------------------------------------------------------------------------------
1 | package testutils
2 | 
3 | import (
4 | "bytes"
5 | "context"
6 | "io"
7 | 
8 | corev1 "k8s.io/api/core/v1"
9 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
10 | "k8s.io/apimachinery/pkg/types"
11 | "k8s.io/client-go/kubernetes"
12 | "sigs.k8s.io/controller-runtime/pkg/client"
13 | 
14 | dicommon "opendilab.org/di-orchestrator/pkg/common"
15 | dicontext "opendilab.org/di-orchestrator/pkg/context"
16 | )
17 | 
18 | func NewPod(name, namespace string, ownRefer metav1.OwnerReference) *corev1.Pod {
19 | pod := &corev1.Pod{
20 | TypeMeta: metav1.TypeMeta{
21 | APIVersion: "v1",
22 | Kind: "Pod",
23 | },
24 | ObjectMeta: metav1.ObjectMeta{
25 | Name: name,
26 | Namespace: namespace,
27 | },
28 | Spec: corev1.PodSpec{
29 | Containers: []corev1.Container{
30 | {
31 | Name: dicommon.DefaultContainerName,
32 | Image: DIJobImage,
33 | Command: []string{"/bin/sh", "-c", "sleep " + DefaultSleepDuration},
34 | },
35 | },
36 | },
37 | }
38 | pod.SetOwnerReferences([]metav1.OwnerReference{ownRefer})
39 | return pod
40 | }
41 | 
42 | func UpdatePodPhase(ctx dicontext.Context, podKey types.NamespacedName, phase corev1.PodPhase) error {
43 | var pod corev1.Pod
44 | err := ctx.Get(context.TODO(), podKey, &pod)
45 | if err != nil {
46 | return err
47 | }
48 | 
49 | containerName := pod.Spec.Containers[0].Name
50 | pod.Status.Phase = phase
51 | if phase == corev1.PodRunning {
52 | state := corev1.ContainerStateRunning{}
53 | cstatus := corev1.ContainerStatus{Name: containerName, State: corev1.ContainerState{
54 | Running: &state,
55 | }, Ready: true}
56 | pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, cstatus)
57 | }
58 | 
59 | err = ctx.Status().Update(context.TODO(), &pod, &client.UpdateOptions{})
60 | if err != nil {
61 | return err
62 | }
63 | return nil
64 | }
65 | 
66 | func GetPodLogs(clientSet *kubernetes.Clientset,
67 | namespace string, podName string, containerName string, follow bool) (string, error) {
68 | podLogOptions := corev1.PodLogOptions{
69 | Container: containerName,
70 | Follow: follow,
71 | }
72 | 
73 | podLogRequest := clientSet.CoreV1().
74 | Pods(namespace).
75 | GetLogs(podName, &podLogOptions)
76 | stream, err := podLogRequest.Stream(context.TODO())
77 | if err != nil {
78 | return "", err
79 | }
80 | defer stream.Close()
81 | 
82 | buf := new(bytes.Buffer)
83 | _, err = io.Copy(buf, stream)
84 | if err != nil {
85 | return "", err
86 | }
87 | str := buf.String()
88 | return str, nil
89 | }
90 | 
-------------------------------------------------------------------------------- /pkg/common/config.go: --------------------------------------------------------------------------------
1 | package common
2 | 
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "os"
7 | 
8 | corev1 "k8s.io/api/core/v1"
9 | "k8s.io/apimachinery/pkg/api/resource"
10 | )
11 | 
12 | const (
13 | // labels for pods
14 | LabelOperator = "diengine/operator"
15 | LabelJob = "diengine/job"
16 | LabelTaskType = "diengine/task-type"
17 | LabelTaskName = "diengine/task-name"
18 | LabelPod = "diengine/pod"
19 | 
20 | // annotations for pods
21 | AnnotationReplicas = "diengine/replicas"
22 | AnnotationRank = "diengine/rank"
23 | AnnotationTaskRank = "diengine/task-rank"
24 | AnnotationNode = "diengine/node"
25 | 
26 | // envs for orchestrator
27 | ENVServerURL = "DI_SERVER_URL"
28 | // envs for pods
29 | ENVJobID = "DI_JOB_ID"
30 | ENVRank = "DI_RANK"
31 | ENVNodes = "DI_NODES"
32 | ENVTaskNodesFormat = "DI_%s_NODES"
33 | 
34 | // dijob oriented
35 | OperatorName = "di-operator"
36 | DefaultContainerName = "di-container"
37 | DefaultPortName = "di-port"
38 | DefaultPort = 22270
39 | 
40 | // system oriented
41 | ResourceGPU = "nvidia.com/gpu"
42 | )
43 | 
44 | var (
45 | // k8s service domain name
46 | svcDomainName = "svc.cluster.local"
47 | 
48 | // DI server access URL
49 | diServerURL = fmt.Sprintf("http://di-server.di-system.%s:8081", svcDomainName)
50 | )
51 | 
52 | func GetDIServerURL() string {
53 | return diServerURL
54 | }
55 | 
56 | func
SetDIServerURL(serverURL string) {
57 | diServerURL = serverURL
58 | }
59 | 
60 | func GetServiceDomainName() string {
61 | return svcDomainName
62 | }
63 | 
64 | func SetServiceDomainName(domainName string) {
65 | svcDomainName = domainName
66 | }
67 | 
68 | func GetDIJobDefaultResources() (corev1.ResourceRequirements, error) {
69 | defaultResource := corev1.ResourceList{
70 | corev1.ResourceCPU: resource.MustParse("1"),
71 | corev1.ResourceMemory: resource.MustParse("2Gi"),
72 | }
73 | resjson := os.Getenv("DI_JOB_DEFAULT_RESOURCES")
74 | if resjson == "" {
75 | return corev1.ResourceRequirements{Requests: defaultResource, Limits: defaultResource}, nil
76 | }
77 | resourceRequire := map[string]corev1.ResourceRequirements{}
78 | if err := json.Unmarshal([]byte(resjson), &resourceRequire); err != nil {
79 | return corev1.ResourceRequirements{}, fmt.Errorf("failed to unmarshal resource requirements: %v", err)
80 | }
81 | if _, ok := resourceRequire["resources"]; !ok {
82 | return corev1.ResourceRequirements{}, fmt.Errorf("resource requirements json is missing the 'resources' key")
83 | }
84 | return resourceRequire["resources"], nil
85 | }
86 | 
-------------------------------------------------------------------------------- /e2e/e2e_suite_test.go: --------------------------------------------------------------------------------
1 | package e2e
2 | 
3 | // import (
4 | // "context"
5 | // "flag"
6 | // "os"
7 | // "path/filepath"
8 | // "testing"
9 | 
10 | // . "github.com/onsi/ginkgo"
11 | // . "github.com/onsi/gomega"
12 | // "k8s.io/client-go/kubernetes"
13 | // "k8s.io/client-go/kubernetes/scheme"
14 | // "k8s.io/client-go/tools/clientcmd"
15 | // "sigs.k8s.io/controller-runtime/pkg/client"
16 | // "sigs.k8s.io/controller-runtime/pkg/envtest/printer"
17 | 
18 | // div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1"
19 | // )
20 | 
21 | // func TestE2E(t *testing.T) {
22 | // RegisterFailHandler(Fail)
23 | 
24 | // RunSpecsWithDefaultAndCustomReporters(t,
25 | // "E2E Suite",
26 | // []Reporter{printer.NewlineReporter{}})
27 | // }
28 | 
29 | // var (
30 | // k8sClient client.Client
31 | // clientset *kubernetes.Clientset
32 | 
33 | // kubeconfig string
34 | // exampleJobsDir string
35 | // sharedVolumesDir string
36 | // )
37 | 
38 | // func init() {
39 | // testing.Init()
40 | 
41 | // if flag.Lookup("kubeconfig") == nil {
42 | // flag.StringVar(&kubeconfig, "kubeconfig", "", "kubeconfig file path")
43 | // }
44 | // flag.StringVar(&sharedVolumesDir, "shared-volumes-dir", "/data/nfs/ding/", "dir to shared volumes")
45 | // flag.StringVar(&exampleJobsDir, "example-jobs-dir", "./config", "dir to the example jobs")
46 | // flag.Parse()
47 | 
48 | // kubeconfig = flag.Lookup("kubeconfig").Value.String()
49 | 
50 | // if kubeconfig == "" {
51 | // kubeconfig = os.Getenv("KUBECONFIG")
52 | // if kubeconfig == "" {
53 | // kubeconfig = filepath.Join(homeDir(), ".kube", "config")
54 | // }
55 | // }
56 | // }
57 | 
58 | // func homeDir() string {
59 | // if h := os.Getenv("HOME"); h != "" {
60 | // return h
61 | // }
62 | // return os.Getenv("USERPROFILE") // windows
63 | // }
64 | 
65 | // var _ = BeforeSuite(func() {
66 | // // uses the current context in kubeconfig
67 | // cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfig)
68 | // Expect(err).NotTo(HaveOccurred())
69 | // err = div2alpha1.AddToScheme(scheme.Scheme)
70 | // Expect(err).NotTo(HaveOccurred())
71 | 
72 | // //+kubebuilder:scaffold:scheme
73 | 
74 | // k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
75 | // 
Expect(err).NotTo(HaveOccurred())
76 | // Expect(k8sClient).NotTo(BeNil())
77 | 
78 | // clientset, err = kubernetes.NewForConfig(cfg)
79 | // Expect(err).NotTo(HaveOccurred())
80 | 
81 | // k8sClient.DeleteAllOf(context.Background(), &div2alpha1.DIJob{},
82 | // client.InNamespace(namespace), client.MatchingLabels{"stability-test": "dijobs"})
83 | // })
84 | 
85 | // var _ = AfterSuite(func() {
86 | // })
87 | 
-------------------------------------------------------------------------------- /.github/workflows/release.yaml: --------------------------------------------------------------------------------
1 | name: Release
2 | 
3 | # When it's time to do a release, this does a full cross-platform build for all
4 | # supported architectures and pushes all of them to Docker Hub. The workflow runs
5 | # on every push; semver-shaped tags additionally update the :latest image tag.
6 | on: [push]
7 | 
8 | env:
9 | version: v1.1.3
10 | 
11 | jobs:
12 | docker:
13 | runs-on: ubuntu-latest
14 | strategy:
15 | matrix:
16 | platform: [ linux/amd64 ]
17 | target: [ di-orchestrator ]
18 | steps:
19 | - name: Checkout
20 | uses: actions/checkout@v2
21 | 
22 | - name: Prepare
23 | id: prep
24 | env:
25 | DOCKERIO_ORG: opendilab
26 | TARGET: ${{ matrix.target }}
27 | run: |
28 | DOCKER_IMAGE=$DOCKERIO_ORG/$TARGET
29 | VERSION=${version}-nightly
30 | if [[ $GITHUB_REF == refs/tags/* ]]; then
31 | VERSION=${GITHUB_REF#refs/tags/}
32 | fi
33 | if [ "${{ github.event_name }}" = "schedule" ]; then
34 | VERSION=edge
35 | fi
36 | TAGS="${DOCKER_IMAGE}:${VERSION}"
37 | if [[ $VERSION =~ ^v[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
38 | TAGS="$TAGS,${DOCKER_IMAGE}:latest"
39 | fi
40 | echo ::set-output name=tags::${TAGS}
41 | 
42 | - name: Set up QEMU
43 | uses: docker/setup-qemu-action@v1
44 | with:
45 | platforms: all
46 | 
47 | - name: Set up Docker Buildx
48 | id: buildx
49 | uses: docker/setup-buildx-action@v1
50 | 
51 | - name: Cache Docker layers
52 | uses: actions/cache@v2
53 | with:
54 | path: /tmp/.buildx-cache
55 | key: ${{ runner.os }}-buildx-${{ github.sha }}
56 | restore-keys: |
57 | ${{ runner.os }}-buildx-
58 | 
59 | - name: Login to DockerHub
60 | if: github.event_name != 'pull_request'
61 | uses: docker/login-action@v1
62 | with:
63 | username: ${{ secrets.DOCKERIO_USERNAME }}
64 | password: ${{ secrets.DOCKERIO_PASSWORD }}
65 | 
66 | - name: Build and push
67 | id: docker_build
68 | uses: docker/build-push-action@v2
69 | with:
70 | builder: ${{ steps.buildx.outputs.name }}
71 | context: ./
72 | file: ./Dockerfile
73 | push: ${{ github.event_name != 'pull_request' }}
74 | tags: ${{ steps.prep.outputs.tags }}
75 | cache-from: type=local,src=/tmp/.buildx-cache
76 | cache-to: type=local,dest=/tmp/.buildx-cache
77 | # target: ${{ matrix.target }}
78 | 
79 | - name: Image digest
80 | run: echo ${{ steps.docker_build.outputs.digest }}
81 | 
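82 | # Illustrative tag resolution from the prep step above (a sketch):
83 | #   GITHUB_REF=refs/tags/v1.2.3 -> opendilab/di-orchestrator:v1.2.3 and :latest
84 | #   GITHUB_REF=refs/heads/main  -> opendilab/di-orchestrator:v1.1.3-nightly
85 | #   scheduled run               -> opendilab/di-orchestrator:edge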
-------------------------------------------------------------------------------- /docs/developer-guide.md: --------------------------------------------------------------------------------
1 | # Developer Guide
2 | 
3 | ## Prerequisites
4 | 
5 | - A well-prepared Kubernetes cluster. Follow the [instructions](https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/create-cluster-kubeadm/) to create one, or create a local single-node cluster with [kind](https://kind.sigs.k8s.io/docs/user/quick-start/) or [minikube](https://minikube.sigs.k8s.io/docs/start/).
6 | - kustomize, installed with the following command:
7 | 
8 | ```bash
9 | curl -s "https://raw.githubusercontent.com/\
10 | kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash
11 | ```
12 | 
13 | ## CRD Design
14 | 
15 | Update the code in [dijob_types.go](../pkg/api/v2alpha1/dijob_types.go) to match your requirements, then generate the deepcopy functions.
16 | 
17 | ```bash
18 | make generate
19 | ```
20 | 
21 | Generate new CRD files with the following command.
22 | 
23 | ```bash
24 | make manifests
25 | ```
26 | 
27 | New CRD files will be generated in [config/crd/bases](../config/crd/bases).
28 | 
29 | ## Controller Logic
30 | 
31 | See [controllers](../pkg/controllers).
32 | 
33 | ## DI Server Logic
34 | 
35 | See [server](../pkg/server).
36 | 
37 | ## Installation
38 | 
39 | Run the following commands in the project root directory.
40 | 
41 | ```bash
42 | # build images.
43 | make docker-build
44 | make docker-push
45 | # deploy di-operator and server to cluster
46 | make dev-deploy
47 | ```
48 | 
49 | Because the generated CustomResourceDefinition is too large for `kubectl apply` (the last-applied-configuration annotation exceeds the size limit), you will probably see the following error:
50 | 
51 | ```bash
52 | The CustomResourceDefinition "dijobs.diengine.opendilab.org" is invalid: metadata.annotations: Too long: must have at most 262144 bytes
53 | ```
54 | 
55 | Creating the CRD instead of applying it avoids the annotation and solves the problem:
56 | 
57 | ```bash
58 | kustomize build config/crd | kubectl create -f -
59 | ```
60 | 
61 | `di-operator` and `di-server` will be installed in the `di-system` namespace.
62 | 
63 | ```bash
64 | $ kubectl get pod -n di-system
65 | NAME READY STATUS RESTARTS AGE
66 | di-operator-57cc65d5c9-5vnvn 1/1 Running 0 59s
67 | di-server-7b86ff8df4-jfgmp 1/1 Running 0 59s
68 | ```
69 | 
70 | ## Programming Specification
71 | 
72 | - Logger: loggers should use `github.com/go-logr/logr.Logger`, created from `sigs.k8s.io/controller-runtime/pkg/log.DelegatingLogger`. We follow these conventions, with a sketch after this list:
73 | - The logger used in each function should be defined as `logger := ctx.Log.WithName(function-name).WithValues("job", job-namespace-name)`. This helps debugging, since we can easily see where a log message comes from and which DIJob it concerns, so DIJob-related information does not need to be repeated in the message itself.
74 | - All log messages should start with a lower-case letter.
75 | 
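76 | A minimal sketch of this convention (the function name `checkJob` is illustrative, not taken from a specific controller):
77 | 
78 | ```go
79 | package controllers
80 | 
81 | import (
82 | "k8s.io/apimachinery/pkg/types"
83 | 
84 | div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1"
85 | dicontext "opendilab.org/di-orchestrator/pkg/context"
86 | )
87 | 
88 | // checkJob shows the logger convention: function name plus the job's namespaced name.
89 | func checkJob(ctx dicontext.Context, job *div2alpha1.DIJob) {
90 | logger := ctx.Log.WithName("checkJob").WithValues("job", types.NamespacedName{Namespace: job.Namespace, Name: job.Name})
91 | logger.Info("start checking job") // message starts with a lower-case letter
92 | }
93 | ```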
-------------------------------------------------------------------------------- /config/default/kustomization.yaml: --------------------------------------------------------------------------------
1 | # Adds namespace to all resources.
2 | namespace: di-system
3 | 
4 | # Value of this field is prepended to the
5 | # names of all resources, e.g. a deployment named
6 | # "wordpress" becomes "alices-wordpress".
7 | # Note that it should also match with the prefix (text before '-') of the namespace
8 | # field above.
9 | # namePrefix: di-
10 | 
11 | # Labels to add to all resources and selectors.
12 | #commonLabels:
13 | # someName: someValue
14 | 
15 | bases:
16 | - ../crd
17 | - ../rbac
18 | - ../manager
19 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
20 | # crd/kustomization.yaml
21 | # - ../webhook
22 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required.
23 | # - ../certmanager
24 | # [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'.
25 | #- ../prometheus
26 | 
27 | patchesStrategicMerge:
28 | # Protect the /metrics endpoint by putting it behind auth.
29 | # If you want your controller-manager to expose the /metrics
30 | # endpoint w/o any authn/z, please comment the following line.
31 | # - manager_auth_proxy_patch.yaml
32 | 
33 | # Mount the controller config file for loading manager configurations
34 | # through a ComponentConfig type
35 | #- manager_config_patch.yaml
36 | 
37 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
38 | # crd/kustomization.yaml
39 | # - manager_webhook_patch.yaml
40 | 
41 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'.
42 | # Uncomment 'CERTMANAGER' sections in crd/kustomization.yaml to enable the CA injection in the admission webhooks.
43 | # 'CERTMANAGER' needs to be enabled to use ca injection
44 | # - webhookcainjection_patch.yaml
45 | 
46 | vars:
47 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix.
48 | # - name: CERTIFICATE_NAMESPACE # namespace of the certificate CR
49 | # objref:
50 | # kind: Certificate
51 | # group: cert-manager.io
52 | # version: v1
53 | # name: di-serving-cert # this name should match the one in certificate.yaml
54 | # fieldref:
55 | # fieldpath: metadata.namespace
56 | # - name: CERTIFICATE_NAME
57 | # objref:
58 | # kind: Certificate
59 | # group: cert-manager.io
60 | # version: v1
61 | # name: di-serving-cert # this name should match the one in certificate.yaml
62 | # - name: SERVICE_NAMESPACE # namespace of the service
63 | # objref:
64 | # kind: Service
65 | # version: v1
66 | # name: di-webhook-service
67 | # fieldref:
68 | # fieldpath: metadata.namespace
69 | # - name: SERVICE_NAME
70 | # objref:
71 | # kind: Service
72 | # version: v1
73 | # name: di-webhook-service
-------------------------------------------------------------------------------- /config/samples/atari-dqn-dist.yaml: --------------------------------------------------------------------------------
1 | apiVersion: diengine.opendilab.org/v2alpha1
2 | kind: DIJob
3 | metadata:
4 | name: job-with-tasks
5 | spec:
6 | priority: "normal" # the job's priority; reserved field that may be used during scheduling
7 | backoffLimit: 0 # maximum number of restarts allowed for the job; nil means unlimited restarts; defaults to 3; the actual restart count is shown in status.restarts
8 | cleanPodPolicy: "Running" # how worker pods are handled after the job finishes; Running means all pods still Running are deleted once the job completes
9 | preemptible: false # whether the job may be preempted; resource changes during scheduling can involve preemption; currently only false is supported
10 | volumes:
11 | - name: cache-volume
12 | emptyDir:
13 | medium: Memory
14 | sizeLimit: 128Mi
15 | tasks:
16 | - replicas: 1
17 | name: "learner"
18 | type: learner
19 | template:
20 | spec:
21 | containers:
22 | - name: di-container
23 | image: opendilab/ding:v0.3.1-dist-test
24 | imagePullPolicy: IfNotPresent
25 | env:
26 | - name: NCCL_DEBUG
27 | value: "INFO"
28 | command: ["/bin/bash", "-c",]
29 | args:
30 | - |
31 | ditask --labels learner \
32 | --package . \
33 | --main dizoo.atari.example.atari_dqn_dist.main \
34 | --parallel-workers 1 \
35 | --topology mesh \
36 | --platform k8s
37 | volumeMounts:
38 | - name: cache-volume
39 | mountPath: /dev/shm
40 | restartPolicy: Never
41 | - replicas: 1
42 | name: "evaluator"
43 | type: evaluator
44 | template:
45 | spec:
46 | containers:
47 | - name: di-container
48 | image: opendilab/ding:v0.3.1-dist-test
49 | imagePullPolicy: IfNotPresent
50 | env:
51 | - name: NCCL_DEBUG
52 | value: "INFO"
53 | command: ["/bin/bash", "-c",]
54 | args:
55 | - |
56 | ditask --labels evaluator \
57 | --package . 
\ 58 | --main dizoo.atari.example.atari_dqn_dist.main \ 59 | --parallel-workers 1 \ 60 | --topology mesh \ 61 | --platform k8s 62 | volumeMounts: 63 | - name: cache-volume 64 | mountPath: /dev/shm 65 | restartPolicy: Never 66 | - replicas: 2 67 | name: "collector" 68 | type: collector 69 | template: 70 | spec: 71 | containers: 72 | - name: di-container 73 | image: opendilab/ding:v0.3.1-dist-test 74 | imagePullPolicy: IfNotPresent 75 | env: 76 | - name: NCCL_DEBUG 77 | value: "INFO" 78 | command: ["/bin/bash", "-c",] 79 | args: 80 | - | 81 | ditask --labels collector \ 82 | --package . \ 83 | --main dizoo.atari.example.atari_dqn_dist.main \ 84 | --parallel-workers 1 \ 85 | --topology mesh \ 86 | --platform k8s 87 | volumeMounts: 88 | - name: cache-volume 89 | mountPath: /dev/shm 90 | restartPolicy: Never -------------------------------------------------------------------------------- /pkg/common/config_test.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "testing" 7 | 8 | . "github.com/onsi/ginkgo" 9 | . "github.com/onsi/gomega" 10 | "k8s.io/apimachinery/pkg/api/resource" 11 | "sigs.k8s.io/controller-runtime/pkg/envtest/printer" 12 | ) 13 | 14 | func TestConfig(t *testing.T) { 15 | RegisterFailHandler(Fail) 16 | 17 | RunSpecsWithDefaultAndCustomReporters(t, 18 | "CommonConfig Suite", 19 | []Reporter{printer.NewlineReporter{}}) 20 | } 21 | 22 | var _ = Describe("Test common config", func() { 23 | Context("Get DIJob default resources", func() { 24 | It("return the default resources", func() { 25 | type testCase struct { 26 | resource string 27 | expectCPU string 28 | expectMem string 29 | } 30 | testCases := []testCase{ 31 | {resource: `{"resources": {"requests": {"cpu": 1, "memory": "2Gi"}}}`, expectCPU: "1", expectMem: "2Gi"}, 32 | {resource: `{"resources": {"requests": {"cpu": 2, "memory": "3Gi"}}}`, expectCPU: "2", expectMem: "3Gi"}, 33 | {resource: "", expectCPU: "1", expectMem: "2Gi"}, 34 | } 35 | for i := range testCases { 36 | c := testCases[i] 37 | By(fmt.Sprintf("Create the %dth DIJob", i+1)) 38 | err := os.Setenv("DI_JOB_DEFAULT_RESOURCES", c.resource) 39 | Expect(err).NotTo(HaveOccurred()) 40 | r, err := GetDIJobDefaultResources() 41 | Expect(err).NotTo(HaveOccurred()) 42 | Expect(r.Requests.Cpu().Equal(resource.MustParse(c.expectCPU))).Should(BeTrue()) 43 | Expect(r.Requests.Memory().Equal(resource.MustParse(c.expectMem))).Should(BeTrue()) 44 | } 45 | }) 46 | 47 | It("return k8s domain name", func() { 48 | type testCase struct { 49 | domainName string 50 | expectDomain string 51 | } 52 | testCases := []testCase{ 53 | {domainName: "svc.k8s.cluster", expectDomain: "svc.k8s.cluster"}, 54 | {domainName: "svc.cluster.local", expectDomain: "svc.cluster.local"}, 55 | } 56 | for i := range testCases { 57 | c := testCases[i] 58 | By(fmt.Sprintf("Create the %dth DIJob", i+1)) 59 | SetServiceDomainName(c.domainName) 60 | domainName := GetServiceDomainName() 61 | Expect(domainName).To(Equal(c.expectDomain)) 62 | } 63 | }) 64 | 65 | It("return server url", func() { 66 | type testCase struct { 67 | url string 68 | expectURL string 69 | } 70 | testCases := []testCase{ 71 | {url: "http://di-server.di-system.svc.cluster.local:8081", expectURL: "http://di-server.di-system.svc.cluster.local:8081"}, 72 | {url: "http://di-server.di-system.svc.cluster.local:8080", expectURL: "http://di-server.di-system.svc.cluster.local:8080"}, 73 | {url: "", expectURL: "http://di-server.di-system.svc.cluster.local:8081"}, 
74 | } 75 | for i := range testCases { 76 | c := testCases[i] 77 | By(fmt.Sprintf("Create the %dth DIJob", i+1)) 78 | err := os.Setenv(ENVServerURL, c.url) 79 | Expect(err).NotTo(HaveOccurred()) 80 | SetDIServerURL(c.expectURL) 81 | url := GetDIServerURL() 82 | Expect(url).To(Equal(c.expectURL)) 83 | } 84 | }) 85 | }) 86 | }) 87 | -------------------------------------------------------------------------------- /chart/templates/rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | name: {{ .Release.Name }} 6 | name: {{ .Release.Name }} 7 | namespace: {{ .Release.Namespace }} 8 | --- 9 | apiVersion: rbac.authorization.k8s.io/v1 10 | kind: Role 11 | metadata: 12 | name: di-leader-election-role 13 | namespace: {{ .Release.Namespace }} 14 | rules: 15 | - apiGroups: 16 | - "" 17 | - coordination.k8s.io 18 | resources: 19 | - configmaps 20 | - leases 21 | verbs: 22 | - get 23 | - list 24 | - watch 25 | - create 26 | - update 27 | - patch 28 | - delete 29 | - apiGroups: 30 | - "" 31 | resources: 32 | - events 33 | verbs: 34 | - create 35 | - patch 36 | --- 37 | apiVersion: rbac.authorization.k8s.io/v1 38 | kind: ClusterRole 39 | metadata: 40 | name: di-metrics-reader 41 | rules: 42 | - nonResourceURLs: 43 | - /metrics 44 | verbs: 45 | - get 46 | --- 47 | apiVersion: rbac.authorization.k8s.io/v1 48 | kind: ClusterRole 49 | metadata: 50 | creationTimestamp: null 51 | name: di-operator-cluster-role 52 | rules: 53 | - apiGroups: 54 | - "" 55 | resources: 56 | - events 57 | - pods 58 | - services 59 | verbs: 60 | - create 61 | - delete 62 | - get 63 | - list 64 | - patch 65 | - update 66 | - watch 67 | - apiGroups: 68 | - "" 69 | resources: 70 | - namespaces 71 | - nodes 72 | verbs: 73 | - get 74 | - list 75 | - watch 76 | - apiGroups: 77 | - diengine.opendilab.org 78 | resources: 79 | - dijobs 80 | verbs: 81 | - create 82 | - delete 83 | - get 84 | - list 85 | - patch 86 | - update 87 | - watch 88 | - apiGroups: 89 | - diengine.opendilab.org 90 | resources: 91 | - dijobs/finalizers 92 | verbs: 93 | - update 94 | - apiGroups: 95 | - diengine.opendilab.org 96 | resources: 97 | - dijobs/status 98 | verbs: 99 | - get 100 | - patch 101 | - update 102 | --- 103 | apiVersion: rbac.authorization.k8s.io/v1 104 | kind: ClusterRole 105 | metadata: 106 | name: di-proxy-role 107 | rules: 108 | - apiGroups: 109 | - authentication.k8s.io 110 | resources: 111 | - tokenreviews 112 | verbs: 113 | - create 114 | - apiGroups: 115 | - authorization.k8s.io 116 | resources: 117 | - subjectaccessreviews 118 | verbs: 119 | - create 120 | --- 121 | apiVersion: rbac.authorization.k8s.io/v1 122 | kind: RoleBinding 123 | metadata: 124 | name: di-leader-election-rolebinding 125 | namespace: {{ .Release.Namespace }} 126 | roleRef: 127 | apiGroup: rbac.authorization.k8s.io 128 | kind: Role 129 | name: di-leader-election-role 130 | subjects: 131 | - kind: ServiceAccount 132 | name: {{ .Release.Name }} 133 | namespace: {{ .Release.Namespace }} 134 | --- 135 | apiVersion: rbac.authorization.k8s.io/v1 136 | kind: ClusterRoleBinding 137 | metadata: 138 | name: di-operator-cluster-rolebinding 139 | roleRef: 140 | apiGroup: rbac.authorization.k8s.io 141 | kind: ClusterRole 142 | name: di-operator-cluster-role 143 | subjects: 144 | - kind: ServiceAccount 145 | name: {{ .Release.Name }} 146 | namespace: {{ .Release.Namespace }} 147 | --- 148 | apiVersion: rbac.authorization.k8s.io/v1 149 | kind: ClusterRoleBinding 150 | metadata: 
151 | name: di-proxy-rolebinding 152 | roleRef: 153 | apiGroup: rbac.authorization.k8s.io 154 | kind: ClusterRole 155 | name: di-proxy-role 156 | subjects: 157 | - kind: ServiceAccount 158 | name: {{ .Release.Name }} 159 | namespace: {{ .Release.Namespace }} -------------------------------------------------------------------------------- /chart/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | labels: 5 | control-plane: {{ .Values.operatorName }} 6 | name: {{ .Values.operatorName }} 7 | namespace: {{ .Release.Namespace }} 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | control-plane: {{ .Values.operatorName }} 13 | template: 14 | metadata: 15 | labels: 16 | control-plane: {{ .Values.operatorName }} 17 | spec: 18 | serviceAccount: {{ .Release.Name }} 19 | containers: 20 | - args: 21 | - --zap-devel=true 22 | - --probe-addr=:8080 23 | - --metric-addr=:8443 24 | - --leader-elect 25 | - --qps={{ .Values.qps }} 26 | - --burst={{ .Values.burst }} 27 | - --service-domain-name={{ .Values.serviceDomainName }} 28 | - --di-server-url=http://{{ .Values.serverName }}.{{ .Release.Namespace }}.{{ .Values.serviceDomainName }}:{{ .Values.serverPort }} 29 | command: 30 | - /di-orchestrator 31 | - operator 32 | envFrom: 33 | - configMapRef: 34 | name: di-config 35 | image: {{ .Values.registry }}/di-orchestrator:{{ .Values.tag }} 36 | imagePullPolicy: Always 37 | livenessProbe: 38 | httpGet: 39 | path: /healthz 40 | port: 8080 41 | initialDelaySeconds: 15 42 | periodSeconds: 20 43 | name: manager 44 | readinessProbe: 45 | httpGet: 46 | path: /readyz 47 | port: 8080 48 | initialDelaySeconds: 5 49 | periodSeconds: 10 50 | resources: 51 | limits: 52 | cpu: 100m 53 | memory: 500Mi 54 | requests: 55 | cpu: 100m 56 | memory: 500Mi 57 | securityContext: 58 | allowPrivilegeEscalation: false 59 | terminationGracePeriodSeconds: 10 60 | --- 61 | apiVersion: apps/v1 62 | kind: Deployment 63 | metadata: 64 | labels: 65 | control-plane: {{ .Values.serverName }} 66 | name: {{ .Values.serverName }} 67 | namespace: {{ .Release.Namespace }} 68 | spec: 69 | replicas: 1 70 | selector: 71 | matchLabels: 72 | control-plane: {{ .Values.serverName }} 73 | template: 74 | metadata: 75 | labels: 76 | control-plane: {{ .Values.serverName }} 77 | spec: 78 | serviceAccount: {{ .Release.Name }} 79 | containers: 80 | - args: 81 | - --zap-devel=true 82 | - --server-bind-address=:{{ .Values.serverPort }} 83 | - --qps={{ .Values.qps }} 84 | - --burst={{ .Values.burst }} 85 | - --service-domain-name={{ .Values.serviceDomainName }} 86 | - --di-server-url=http://{{ .Values.serverName }}.{{ .Release.Namespace }}.{{ .Values.serviceDomainName }}:{{ .Values.serverPort }} 87 | command: 88 | - /di-orchestrator 89 | - server 90 | envFrom: 91 | - configMapRef: 92 | name: di-config 93 | image: {{ .Values.registry }}/di-orchestrator:{{ .Values.tag }} 94 | imagePullPolicy: Always 95 | livenessProbe: 96 | httpGet: 97 | path: /healthz 98 | port: 8080 99 | initialDelaySeconds: 15 100 | periodSeconds: 20 101 | name: server 102 | resources: 103 | limits: 104 | cpu: 100m 105 | memory: 500Mi 106 | requests: 107 | cpu: 100m 108 | memory: 500Mi 109 | securityContext: 110 | allowPrivilegeEscalation: false 111 | terminationGracePeriodSeconds: 10 112 | -------------------------------------------------------------------------------- /pkg/controllers/suite_test.go: 
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2021 The OpenDILab authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package controllers 18 | 19 | import ( 20 | "fmt" 21 | "path/filepath" 22 | "testing" 23 | "time" 24 | 25 | . "github.com/onsi/ginkgo" 26 | "github.com/onsi/ginkgo/config" 27 | . "github.com/onsi/gomega" 28 | "k8s.io/apimachinery/pkg/runtime" 29 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 30 | ctrl "sigs.k8s.io/controller-runtime" 31 | "sigs.k8s.io/controller-runtime/pkg/envtest" 32 | "sigs.k8s.io/controller-runtime/pkg/envtest/printer" 33 | logf "sigs.k8s.io/controller-runtime/pkg/log" 34 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 35 | 36 | v2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1" 37 | dicontext "opendilab.org/di-orchestrator/pkg/context" 38 | //+kubebuilder:scaffold:imports 39 | ) 40 | 41 | // These tests use Ginkgo (BDD-style Go testing framework). Refer to 42 | // http://onsi.github.io/ginkgo/ to learn more about Ginkgo. 43 | 44 | const ( 45 | timeout = 5 * time.Second 46 | interval = 250 * time.Millisecond 47 | duration = 200 * time.Millisecond 48 | ) 49 | 50 | // var cfg *rest.Config 51 | var ( 52 | ctx dicontext.Context 53 | testEnv *envtest.Environment 54 | scheme = runtime.NewScheme() 55 | ) 56 | 57 | func TestControllers(t *testing.T) { 58 | RegisterFailHandler(Fail) 59 | 60 | RunSpecsWithDefaultAndCustomReporters(t, 61 | "DI-Controller Suite", 62 | []Reporter{printer.NewlineReporter{}}) 63 | } 64 | 65 | var _ = BeforeSuite(func() { 66 | logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) 67 | 68 | By("bootstrapping test environment") 69 | testEnv = &envtest.Environment{ 70 | CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, 71 | ErrorIfCRDPathMissing: true, 72 | } 73 | 74 | cfg, err := testEnv.Start() 75 | Expect(err).NotTo(HaveOccurred()) 76 | Expect(cfg).NotTo(BeNil()) 77 | 78 | err = v2alpha1.AddToScheme(scheme) 79 | Expect(err).NotTo(HaveOccurred()) 80 | err = clientgoscheme.AddToScheme(scheme) 81 | Expect(err).NotTo(HaveOccurred()) 82 | 83 | //+kubebuilder:scaffold:scheme 84 | 85 | // create controller manager 86 | metricPort := config.GinkgoConfig.ParallelNode + 8200 87 | metricAddress := fmt.Sprintf(":%d", metricPort) 88 | k8sManager, err := ctrl.NewManager(cfg, ctrl.Options{ 89 | Scheme: scheme, 90 | MetricsBindAddress: metricAddress, 91 | }) 92 | Expect(err).NotTo(HaveOccurred()) 93 | 94 | ctx = dicontext.NewContext(cfg, 95 | k8sManager.GetClient(), 96 | k8sManager.GetEventRecorderFor("controller"), 97 | ctrl.Log.WithName("controller")) 98 | reconciler := NewDIJobReconciler(k8sManager.GetScheme(), ctx) 99 | if err = reconciler.SetupWithManager(k8sManager); err != nil { 100 | Expect(err).NotTo(HaveOccurred()) 101 | } 102 | 103 | Expect(err).NotTo(HaveOccurred()) 104 | 105 | // starting manager 106 | go func() { 107 | err = k8sManager.Start(ctrl.SetupSignalHandler()) 108 | 
Expect(err).NotTo(HaveOccurred()) 109 | }() 110 | }, 60) 111 | 112 | var _ = AfterSuite(func() { 113 | By("tearing down the test environment") 114 | err := testEnv.Stop() 115 | Expect(err).NotTo(HaveOccurred()) 116 | }) 117 | -------------------------------------------------------------------------------- /config/samples/atari-dqn-dist-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: diengine.opendilab.org/v2alpha1 2 | kind: DIJob 3 | metadata: 4 | name: job-with-tasks 5 | # generateName: gobigger-test- 6 | spec: 7 | priority: "normal" # the job's priority; a reserved field that may be used during scheduling 8 | backoffLimit: 0 # the maximum number of restarts allowed for the job; nil means unlimited restarts; defaults to 3. The actual restart count is visible in status.restarts 9 | cleanPodPolicy: "Running" # how worker pods are handled after the job finishes. "Running" means all pods still in the Running phase are deleted once the job completes 10 | preemptible: false # whether the job may be preempted; changing a job's resources during scheduling involves preemption. Currently this can only be set to false 11 | volumes: 12 | - name: cache-volume 13 | emptyDir: 14 | medium: Memory 15 | sizeLimit: 128Mi 16 | - name: config-py 17 | configMap: 18 | name: config-py 19 | tasks: 20 | - replicas: 1 21 | name: "learner" 22 | type: learner 23 | template: 24 | spec: 25 | containers: 26 | - name: di-container 27 | image: opendilab/ding:v0.3.1-dist 28 | imagePullPolicy: IfNotPresent 29 | env: 30 | - name: NCCL_DEBUG 31 | value: "INFO" 32 | command: ["/bin/bash", "-c",] 33 | args: 34 | - | 35 | cat /etc/config/config.py 36 | ditask --labels learner \ 37 | --package . \ 38 | --main dizoo.atari.example.atari_dqn_dist.main \ 39 | --parallel-workers 2 \ 40 | --topology mesh \ 41 | --platform k8s 42 | volumeMounts: 43 | - name: cache-volume 44 | mountPath: /dev/shm 45 | - name: config-py 46 | mountPath: /etc/config 47 | restartPolicy: Never 48 | - replicas: 1 49 | name: "evaluator" 50 | type: evaluator 51 | template: 52 | spec: 53 | containers: 54 | - name: di-container 55 | image: opendilab/ding:v0.3.1-dist 56 | imagePullPolicy: IfNotPresent 57 | env: 58 | - name: NCCL_DEBUG 59 | value: "INFO" 60 | command: ["/bin/bash", "-c",] 61 | args: 62 | - | 63 | cat /etc/config/config.py 64 | ditask --labels evaluator \ 65 | --package . \ 66 | --main dizoo.atari.example.atari_dqn_dist.main \ 67 | --parallel-workers 2 \ 68 | --topology mesh \ 69 | --platform k8s 70 | volumeMounts: 71 | - name: cache-volume 72 | mountPath: /dev/shm 73 | - name: config-py 74 | mountPath: /etc/config 75 | restartPolicy: Never 76 | - replicas: 1 77 | name: "collector" 78 | type: collector 79 | template: 80 | spec: 81 | containers: 82 | - name: di-container 83 | image: opendilab/ding:v0.3.1-dist 84 | imagePullPolicy: IfNotPresent 85 | env: 86 | - name: NCCL_DEBUG 87 | value: "INFO" 88 | command: ["/bin/bash", "-c",] 89 | args: 90 | - | 91 | cat /etc/config/config.py 92 | ditask --labels collector \ 93 | --package . \ 94 | --main dizoo.atari.example.atari_dqn_dist.main \ 95 | --parallel-workers 2 \ 96 | --topology mesh \ 97 | --platform k8s 98 | volumeMounts: 99 | - name: cache-volume 100 | mountPath: /dev/shm 101 | - name: config-py 102 | mountPath: /etc/config 103 | restartPolicy: Never 104 | --- 105 | apiVersion: v1 106 | kind: ConfigMap 107 | metadata: 108 | name: config-py 109 | data: 110 | config.py: | 111 | from easydict import EasyDict 112 | 113 | cartpole_dqn_config = dict( 114 | exp_name='cartpole_dqn_seed0', 115 | env=dict( 116 | collector_env_num=8, 117 | evaluator_env_num=5, 118 | n_evaluator_episode=5, 119 | stop_value=195, 120 | replay_path='cartpole_dqn_seed0/video', 121 | ), 122 | ...
123 | ) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build](https://github.com/opendilab/DI-orchestrator/actions/workflows/build.yaml/badge.svg?branch=main)](https://github.com/opendilab/DI-orchestrator/actions/workflows/build.yaml) [![Releases](https://github.com/opendilab/DI-orchestrator/actions/workflows/release.yaml/badge.svg)](https://github.com/opendilab/DI-orchestrator/actions/workflows/release.yaml) 2 | # DI Orchestrator 3 | 4 | DI Orchestrator is designed to manage DI ([Decision Intelligence](https://github.com/opendilab/DI-engine/)) jobs using Kubernetes Custom Resource and Operator. 5 | 6 | ### Prerequisites 7 | 8 | - A well-prepared kubernetes cluster. Follow the [instructions](https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/create-cluster-kubeadm/) to create a kubernetes cluster, or create a local kubernetes node referring to [kind](https://kind.sigs.k8s.io/docs/user/quick-start/) or [minikube](https://minikube.sigs.k8s.io/docs/start/). 9 | 10 | ### Install DI Orchestrator 11 | 12 | DI Orchestrator consists of two components: `di-operator` and `di-server`. Install them with the following command. 13 | 14 | ```bash 15 | kubectl create -f ./config/di-manager.yaml 16 | ``` 17 | 18 | `di-operator` and `di-server` will be installed in the `di-system` namespace. 19 | 20 | ```bash 21 | $ kubectl get pod -n di-system 22 | NAME READY STATUS RESTARTS AGE 23 | di-operator-57cc65d5c9-5vnvn 1/1 Running 0 59s 24 | di-server-7b86ff8df4-jfgmp 1/1 Running 0 59s 25 | ``` 26 | 27 | ### Submit DIJob 28 | 29 | ```bash 30 | # submit DIJob 31 | $ kubectl create -f config/samples/atari-dqn-tasks.yaml 32 | 33 | # get pods and you will see the job's worker pods created by DI Orchestrator 34 | # (the learner, evaluator and collector replicas defined in the job's tasks) 35 | $ kubectl get pod 36 | NAME READY STATUS RESTARTS AGE 37 | job-with-tasks-collector-0 1/1 Running 0 2s 38 | job-with-tasks-collector-1 1/1 Running 0 2s 39 | job-with-tasks-evaluator-0 1/1 Running 0 2s 40 | job-with-tasks-learner-0 1/1 Running 0 2s 41 | 42 | # get logs of tasks 43 | $ kubectl logs job-with-tasks-evaluator-0 44 | /opt/conda/lib/python3.8/site-packages/torch/cuda/__init__.py:52: UserWarning: CUDA initialization: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx (Triggered internally at /opt/conda/conda-bld/pytorch_1607370172916/work/c10/cuda/CUDAFunctions.cpp:100.) 45 | return torch._C._cuda_getDeviceCount() > 0 46 | [06-28 08:25:29] INFO Evaluator running on node 1 func.py:58 47 | A.L.E: Arcade Learning Environment (version +a54a328) 48 | [Powered by Stella] 49 | /opt/conda/lib/python3.8/site-packages/ale_py/roms/__init__.py:44: UserWarning: ale_py.roms contains unsupported ROMs: /opt/conda/lib/python3.8/site-packages/AutoROM/roms/{joust.bin, warlords.bin, maze_craze.bin, combat.bin} 50 | warnings.warn( 51 | [06-28 08:25:46] INFO Evaluation: Train Iter(0) Env Step(0) Eval Reward(-21.000) func.py:58 52 | [06-28 08:25:46] WARNING You have not installed memcache package! DI-engine has changed to some alternatives. 53 | 54 | $ kubectl logs job-with-tasks-learner-0 55 | /opt/conda/lib/python3.8/site-packages/torch/cuda/__init__.py:52: UserWarning: CUDA initialization: Found no NVIDIA driver on your system.
Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx (Triggered internally at /opt/conda/conda-bld/pytorch_1607370172916/work/c10/cuda/CUDAFunctions.cpp:100.) 56 | return torch._C._cuda_getDeviceCount() > 0 57 | [06-28 08:25:27] INFO Learner running on node 0 58 | ``` 59 | 60 | ## User Guide 61 | 62 | Refer to the [user-guide](./docs/architecture.md). For the Chinese version, please refer to [中文手册](./docs/architecture-cn.md). 63 | 64 | ## Contributing 65 | 66 | Refer to the [developer-guide](./docs/developer-guide.md). 67 | 68 | Contact us through 69 | -------------------------------------------------------------------------------- /cmd/server/server.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2021 The OpenDILab authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package server 17 | 18 | import ( 19 | "flag" 20 | 21 | "github.com/spf13/cobra" 22 | "k8s.io/apimachinery/pkg/runtime" 23 | utilruntime "k8s.io/apimachinery/pkg/util/runtime" 24 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 25 | ctrl "sigs.k8s.io/controller-runtime" 26 | "sigs.k8s.io/controller-runtime/pkg/healthz" 27 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 28 | 29 | cmdcommon "opendilab.org/di-orchestrator/cmd/common" 30 | div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1" 31 | dicommon "opendilab.org/di-orchestrator/pkg/common" 32 | dicontext "opendilab.org/di-orchestrator/pkg/context" 33 | "opendilab.org/di-orchestrator/pkg/server" 34 | ) 35 | 36 | type CreateOptions struct { 37 | *cmdcommon.GenericFlags 38 | 39 | ServerBindAddress string 40 | ProbeAddress string 41 | MetricAddress string 42 | } 43 | 44 | func NewCreateOptions(genFlags *cmdcommon.GenericFlags) *CreateOptions { 45 | return &CreateOptions{ 46 | GenericFlags: genFlags, 47 | ServerBindAddress: ":8081", 48 | ProbeAddress: ":8080", 49 | MetricAddress: ":8443", 50 | } 51 | } 52 | 53 | func (o *CreateOptions) AddFlags(cmd *cobra.Command) { 54 | cmd.Flags().StringVarP(&o.ServerBindAddress, "server-bind-address", "s", o.ServerBindAddress, 55 | "The address for server to bind to.") 56 | cmd.Flags().StringVarP(&o.ProbeAddress, "probe-address", "p", o.ProbeAddress, 57 | "The address for probe to connect to.") 58 | cmd.Flags().StringVar(&o.MetricAddress, "metric-addr", o.MetricAddress, "The address the metric endpoint binds to.") 59 | } 60 | 61 | // serverCmd represents the server command 62 | func NewCmdServer(genFlags *cmdcommon.GenericFlags) *cobra.Command { 63 | o := NewCreateOptions(genFlags) 64 | var serverCmd = &cobra.Command{ 65 | Use: "server", 66 | Short: "Command to run di-server ", 67 | Long: `Run di-server with specified configuration. 68 | 69 | Examples: 70 | # Start di-server with probe address and server bind address specified.
71 | di-orchestrator server -p :8080 -s :8081 72 | `, 73 | Run: func(cmd *cobra.Command, args []string) { 74 | cobra.CheckErr(runCommand(cmd, o)) 75 | }, 76 | } 77 | 78 | o.AddFlags(serverCmd) 79 | return serverCmd 80 | } 81 | 82 | var ( 83 | scheme = runtime.NewScheme() 84 | setupLog = ctrl.Log.WithName("setup") 85 | ) 86 | 87 | func init() { 88 | utilruntime.Must(clientgoscheme.AddToScheme(scheme)) 89 | 90 | utilruntime.Must(div2alpha1.AddToScheme(scheme)) 91 | //+kubebuilder:scaffold:scheme 92 | } 93 | 94 | func runCommand(cmd *cobra.Command, options *CreateOptions) error { 95 | flag.Parse() 96 | logger := zap.New(zap.UseFlagOptions(options.GenericFlags.ZapOpts)) 97 | ctrl.SetLogger(logger) 98 | 99 | // set common config 100 | dicommon.SetServiceDomainName(options.ServiceDomainName) 101 | dicommon.SetDIServerURL(options.DIServerURL) 102 | 103 | config := ctrl.GetConfigOrDie() 104 | config.QPS = float32(options.QPS) 105 | config.Burst = options.Burst 106 | mgr, err := ctrl.NewManager(config, ctrl.Options{ 107 | Scheme: scheme, 108 | MetricsBindAddress: options.MetricAddress, 109 | HealthProbeBindAddress: options.ProbeAddress, 110 | }) 111 | if err != nil { 112 | setupLog.Error(err, "unable to start manager") 113 | return err 114 | } 115 | 116 | ctx := dicontext.NewContext(config, 117 | mgr.GetClient(), 118 | mgr.GetEventRecorderFor("di-server"), 119 | ctrl.Log.WithName("di-server")) 120 | processor := server.NewProcessor(ctx) 121 | diServer := server.NewDIServer(ctx, processor, options.ServerBindAddress) 122 | if err := mgr.Add(diServer); err != nil { setupLog.Error(err, "unable to add di-server to manager"); return err } 123 | 124 | if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { 125 | setupLog.Error(err, "unable to set up health check") 126 | return err 127 | } 128 | if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { 129 | setupLog.Error(err, "unable to set up ready check") 130 | return err 131 | } 132 | 133 | setupLog.Info("starting manager") 134 | if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { 135 | setupLog.Error(err, "problem running manager") 136 | return err 137 | } 138 | return nil 139 | } 140 | -------------------------------------------------------------------------------- /pkg/allocator/allocator.go: -------------------------------------------------------------------------------- 1 | package allocator 2 | 3 | import ( 4 | "context" 5 | "time" 6 | 7 | corev1 "k8s.io/api/core/v1" 8 | "k8s.io/apimachinery/pkg/runtime" 9 | ctrl "sigs.k8s.io/controller-runtime" 10 | "sigs.k8s.io/controller-runtime/pkg/builder" 11 | "sigs.k8s.io/controller-runtime/pkg/client" 12 | "sigs.k8s.io/controller-runtime/pkg/source" 13 | 14 | alloctypes "opendilab.org/di-orchestrator/pkg/allocator/types" 15 | div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1" 16 | dicommon "opendilab.org/di-orchestrator/pkg/common" 17 | dicontext "opendilab.org/di-orchestrator/pkg/context" 18 | diutil "opendilab.org/di-orchestrator/pkg/utils" 19 | ) 20 | 21 | type Allocator struct { 22 | Scheme *runtime.Scheme 23 | ctx dicontext.Context 24 | policy alloctypes.FitPolicy 25 | scheduleDuration time.Duration 26 | last time.Time 27 | } 28 | 29 | func NewAllocator(scheme *runtime.Scheme, ctx dicontext.Context, policy alloctypes.FitPolicy, scheduleDuration time.Duration) *Allocator { 30 | return &Allocator{ 31 | Scheme: scheme, 32 | ctx: ctx, 33 | policy: policy, 34 | scheduleDuration: scheduleDuration, 35 | last: time.Now(), 36 | } 37 | } 38 | 39 | func (a *Allocator) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { 40 | log :=
a.ctx.Log.WithName("Reconcile").WithValues("job", req.NamespacedName) 41 | if !a.needReconcile() { 42 | log.V(2).Info("skipped reconcile since scheduling duration not met") 43 | return ctrl.Result{}, nil 44 | } 45 | a.updateLastTime() 46 | 47 | jobkey := req.NamespacedName 48 | job := &div2alpha1.DIJob{} 49 | if err := a.ctx.Get(ctx, jobkey, job); err != nil { 50 | return ctrl.Result{}, err 51 | } 52 | 53 | jobinfo, err := getJobInfo(job) 54 | if err != nil { 55 | log.Error(err, "get jobinfo failed") 56 | return ctrl.Result{}, err 57 | } 58 | nodes, err := a.ctx.ListNodes(ctx) 59 | if err != nil { 60 | log.Error(err, "list nodes failed") 61 | return ctrl.Result{}, err 62 | } 63 | 64 | nodeinfos, err := a.getNodeInfos(ctx, nodes) 65 | if err != nil { 66 | log.Error(err, "list nodeinfos failed") 67 | return ctrl.Result{}, err 68 | } 69 | log.V(2).Info("get", "nodeinfos", nodeinfos) 70 | jobinfos := map[string]alloctypes.JobInfo{ 71 | jobinfo.Key.String(): jobinfo, 72 | } 73 | prevAllocations := map[string]alloctypes.NodeList{} 74 | if err := a.allocateAll(jobinfos, nodeinfos, prevAllocations); err != nil { 75 | return ctrl.Result{}, err 76 | } 77 | return ctrl.Result{}, nil 78 | } 79 | 80 | // SetupWithManager sets up the controller with the Manager. 81 | func (a *Allocator) SetupWithManager(mgr ctrl.Manager) error { 82 | return ctrl.NewControllerManagedBy(mgr). 83 | For(&div2alpha1.DIJob{}). 84 | Watches( 85 | &source.Kind{Type: &div2alpha1.DIJob{}}, 86 | &dicommon.EventHandler{ 87 | OnCreateHandlers: []func(obj client.Object){ 88 | a.onJobAddHandler, 89 | }, 90 | }, 91 | builder.Predicates{}, 92 | ). 93 | Watches( 94 | &source.Kind{Type: &corev1.Node{}}, 95 | &dicommon.EventHandler{}, 96 | ). 97 | Watches( 98 | &source.Kind{Type: &corev1.Pod{}}, 99 | &dicommon.EventHandler{}, 100 | ). 101 | Complete(a) 102 | } 103 | 104 | // onJobAddHandler handles the event when a job is created. 105 | func (a *Allocator) onJobAddHandler(obj client.Object) { 106 | log := a.ctx.Log.WithName("onJobAddHandler").WithValues("job", diutil.NamespacedName(obj.GetNamespace(), obj.GetName())) 107 | job := obj.(*div2alpha1.DIJob) 108 | 109 | if err := a.allocate(context.Background(), job); err != nil { 110 | log.Error(err, "failed to allocate") 111 | } 112 | } 113 | 114 | // needReconcile returns true when the time remaining until the next scheduled reconcile is less than one second, i.e. it is almost time to schedule again.
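// For example, with the operator's default one-minute sync period, a reconcile arriving 59.5s after the last accepted one passes this check, while one arriving after only 30s is skipped (illustrative timings, not taken from the source).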
115 | func (a *Allocator) needReconcile() bool { 116 | return (a.scheduleDuration - time.Since(a.last)) < time.Second 117 | } 118 | 119 | func (a *Allocator) updateLastTime() { 120 | a.last = time.Now() 121 | } 122 | 123 | func (a *Allocator) allocate(ctx context.Context, job *div2alpha1.DIJob) error { 124 | log := a.ctx.Log.WithName("allocate").WithValues("job", diutil.NamespacedName(job.Namespace, job.Name)) 125 | old := job.DeepCopy() 126 | // allocate job if preemptible, otherwise just update status.replicas 127 | if job.Spec.Preemptible { 128 | jobinfo, err := getJobInfo(job) 129 | if err != nil { 130 | log.Error(err, "get jobinfo failed") 131 | return err 132 | } 133 | nodes, err := a.ctx.ListNodes(ctx) 134 | if err != nil { 135 | return err 136 | } 137 | nodeinfos, err := a.getNodeInfos(ctx, nodes) 138 | if err != nil { 139 | return err 140 | } 141 | allocation, err := a.policy.Allocate(jobinfo, nodeinfos) 142 | if err != nil { 143 | return err 144 | } 145 | log.Info("successfully allocate", "allocation", allocation) 146 | if len(allocation) != 0 { 147 | job.Status.Allocation = allocation 148 | } 149 | } else { 150 | // TODO(liqingping): perform the initial allocation 151 | // job.Status.Replicas = job.Spec.MinReplicas 152 | } 153 | 154 | if err := a.ctx.UpdateJobAllocationInCluster(ctx, old, job); err != nil { 155 | return err 156 | } 157 | return nil 158 | } 159 | 160 | func (a *Allocator) allocateAll(jobinfos map[string]alloctypes.JobInfo, nodeinfos map[string]*alloctypes.NodeInfo, prevAllocations map[string]alloctypes.NodeList) error { 161 | log := a.ctx.Log.WithName("allocateAll") 162 | allocations, err := a.policy.Optimize(jobinfos, nodeinfos, prevAllocations) 163 | if err != nil { 164 | return err 165 | } 166 | log.Info("successfully allocate all", "allocations", allocations) 167 | return nil 168 | } 169 | -------------------------------------------------------------------------------- /pkg/server/processor.go: -------------------------------------------------------------------------------- 1 | package server 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "strconv" 7 | 8 | "github.com/gin-gonic/gin" 9 | k8serrors "k8s.io/apimachinery/pkg/api/errors" 10 | "k8s.io/apimachinery/pkg/types" 11 | 12 | div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1" 13 | dicommon "opendilab.org/di-orchestrator/pkg/common" 14 | dicontext "opendilab.org/di-orchestrator/pkg/context" 15 | servertypes "opendilab.org/di-orchestrator/pkg/server/types" 16 | diutil "opendilab.org/di-orchestrator/pkg/utils" 17 | ) 18 | 19 | type ProcessorInterface interface { 20 | GetReplicas(c *gin.Context) (servertypes.Object, error) 21 | AddReplicas(c *gin.Context) (servertypes.Object, error) 22 | DeleteReplicas(c *gin.Context) (servertypes.Object, error) 23 | PostProfilings(c *gin.Context) (servertypes.Object, error) 24 | } 25 | 26 | type processor struct { 27 | ctx dicontext.Context 28 | } 29 | 30 | func NewProcessor(ctx dicontext.Context) ProcessorInterface { 31 | return &processor{ 32 | ctx: ctx, 33 | } 34 | } 35 | 36 | func (p *processor) GetReplicas(c *gin.Context) (servertypes.Object, error) { 37 | // get request params from request 38 | job, err := p.getRequestJob(c) 39 | if err != nil { 40 | return nil, err 41 | } 42 | log := p.ctx.Log.WithName("processor.GetReplicas").WithValues("job", diutil.NamespacedName(job.Namespace, job.Name)) 43 | 44 | reps, err := p.getNamespacedReplicas(context.Background(), job) 45 | if err != nil { 46 | return nil, err 47 | } 48 | 49 | log.Info("successfully get replicas", "replicas", reps) 50 |
return reps, nil 51 | } 52 | func (p *processor) AddReplicas(c *gin.Context) (servertypes.Object, error) { 53 | // get request params from request 54 | job, err := p.getRequestJob(c) 55 | if err != nil { 56 | return nil, err 57 | } 58 | log := p.ctx.Log.WithName("processor.AddReplicas").WithValues("job", diutil.NamespacedName(job.Namespace, job.Name)) 59 | 60 | var reqs servertypes.DIJobRequest 61 | if err = c.ShouldBindJSON(&reqs); err != nil { 62 | return nil, servertypes.NewBadRequestError(err) 63 | } 64 | // add replicas 65 | if !job.Spec.Preemptible { 66 | old := job.DeepCopy() 67 | job.Status.Replicas += int32(reqs.Replicas) 68 | if err := p.ctx.UpdateJobAllocationInCluster(context.Background(), old, job); err != nil { 69 | log.Error(err, "update job status") 70 | return nil, err 71 | } 72 | } 73 | log.Info("successfully add replicas", "number", reqs.Replicas) 74 | return nil, nil 75 | } 76 | func (p *processor) DeleteReplicas(c *gin.Context) (servertypes.Object, error) { 77 | // get request body 78 | job, err := p.getRequestJob(c) 79 | if err != nil { 80 | return nil, servertypes.NewBadRequestError(err) 81 | } 82 | log := p.ctx.Log.WithName("processor.DeleteReplicas").WithValues("job", diutil.NamespacedName(job.Namespace, job.Name)) 83 | 84 | var reqs servertypes.DIJobRequest 85 | if err = c.ShouldBindJSON(&reqs); err != nil { 86 | return nil, servertypes.NewBadRequestError(err) 87 | } 88 | 89 | // delete replicas 90 | if !job.Spec.Preemptible { 91 | old := job.DeepCopy() 92 | job.Status.Replicas -= int32(reqs.Replicas) 93 | if err := p.ctx.UpdateJobAllocationInCluster(context.Background(), old, job); err != nil { 94 | log.Error(err, "update job status") 95 | return nil, err 96 | } 97 | } 98 | log.Info("successfully delete replicas", "number", reqs.Replicas) 99 | return nil, nil 100 | } 101 | func (p *processor) PostProfilings(c *gin.Context) (servertypes.Object, error) { 102 | // get request body 103 | job, err := p.getRequestJob(c) 104 | if err != nil { 105 | return nil, servertypes.NewBadRequestError(err) 106 | } 107 | log := p.ctx.Log.WithName("processor.PostProfilings").WithValues("job", diutil.NamespacedName(job.Namespace, job.Name)) 108 | 109 | var reqs div2alpha1.Profilings 110 | if err = c.ShouldBindJSON(&reqs); err != nil { 111 | return nil, servertypes.NewBadRequestError(err) 112 | } 113 | 114 | old := job.DeepCopy() 115 | job.Status.Profilings = reqs 116 | if err := p.ctx.UpdateJobProfilingsInCluster(context.Background(), old, job); err != nil { 117 | log.Error(err, "update job status") 118 | return nil, err 119 | } 120 | log.Info("successfully report profilings", "profilings", reqs) 121 | return nil, nil 122 | } 123 | 124 | func (p *processor) getRequestJob(c *gin.Context) (*div2alpha1.DIJob, error) { 125 | rawID := c.Param("id") 126 | namespace, name, err := parseJobID(rawID) 127 | if err != nil { 128 | return nil, servertypes.NewBadRequestError(err) 129 | } 130 | 131 | job := &div2alpha1.DIJob{} 132 | key := types.NamespacedName{Namespace: namespace, Name: name} 133 | err = p.ctx.Get(context.Background(), key, job) 134 | if err != nil { 135 | if k8serrors.IsNotFound(err) { 136 | return nil, servertypes.NewNotFoundError(err) 137 | } 138 | return nil, err 139 | } 140 | return job, nil 141 | } 142 | 143 | func (p *processor) getNamespacedReplicas(ctx context.Context, job *div2alpha1.DIJob) ([]string, error) { 144 | // list pods that belong to the DIJob 145 | pods, err := p.ctx.ListJobPods(ctx, job) 146 | if err != nil { 147 | return nil, err 148 | } 149 | 150 | // get 
access urls 151 | var urls []string 152 | for _, pod := range pods { 153 | if pod.Status.PodIP == "" { 154 | continue 155 | } 156 | replicas, _ := strconv.Atoi(pod.Annotations[dicommon.AnnotationReplicas]) 157 | rank, _ := strconv.Atoi(pod.Annotations[dicommon.AnnotationRank]) 158 | if urls == nil { 159 | urls = make([]string, replicas) 160 | } 161 | port, found := diutil.GetDefaultPortFromPod(pod) 162 | if !found { 163 | port = dicommon.DefaultPort 164 | } 165 | podIP := pod.Status.PodIP 166 | url := fmt.Sprintf("%s:%d", podIP, port) 167 | urls[rank] = url 168 | } 169 | return urls, nil 170 | } 171 | -------------------------------------------------------------------------------- /pkg/server/suite_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2021 The OpenDILab authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package server 18 | 19 | import ( 20 | "context" 21 | "fmt" 22 | "net" 23 | "path/filepath" 24 | "strconv" 25 | "testing" 26 | "time" 27 | 28 | . "github.com/onsi/ginkgo" 29 | "github.com/onsi/ginkgo/config" 30 | . "github.com/onsi/gomega" 31 | corev1 "k8s.io/api/core/v1" 32 | "k8s.io/apimachinery/pkg/api/resource" 33 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 34 | "k8s.io/apimachinery/pkg/runtime" 35 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 36 | ctrl "sigs.k8s.io/controller-runtime" 37 | "sigs.k8s.io/controller-runtime/pkg/client" 38 | "sigs.k8s.io/controller-runtime/pkg/envtest" 39 | "sigs.k8s.io/controller-runtime/pkg/envtest/printer" 40 | logf "sigs.k8s.io/controller-runtime/pkg/log" 41 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 42 | 43 | div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1" 44 | dicontext "opendilab.org/di-orchestrator/pkg/context" 45 | //+kubebuilder:scaffold:imports 46 | ) 47 | 48 | // These tests use Ginkgo (BDD-style Go testing framework). Refer to 49 | // http://onsi.github.io/ginkgo/ to learn more about Ginkgo. 
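// Reader's note: this suite boots an envtest control plane, registers the DIJob and client-go schemes, seeds four fake nodes with varying GPU capacity, and serves the DI server on localhost at port 8150 plus the Ginkgo parallel-node offset so parallel test processes do not collide on ports.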
50 | 51 | const ( 52 | timeout = 5 * time.Second 53 | interval = 250 * time.Millisecond 54 | // duration = 500 * time.Millisecond 55 | 56 | localServingHost = "localhost" 57 | port = 8150 58 | ) 59 | 60 | // var cfg *rest.Config 61 | var ( 62 | k8sClient client.Client 63 | testEnv *envtest.Environment 64 | localServingPort = port 65 | scheme = runtime.NewScheme() 66 | ) 67 | 68 | func TestServer(t *testing.T) { 69 | RegisterFailHandler(Fail) 70 | 71 | RunSpecsWithDefaultAndCustomReporters(t, 72 | "DI-Server Suite", 73 | []Reporter{printer.NewlineReporter{}}) 74 | } 75 | 76 | var _ = BeforeSuite(func() { 77 | logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) 78 | 79 | By("bootstrapping test environment") 80 | testEnv = &envtest.Environment{ 81 | CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, 82 | ErrorIfCRDPathMissing: true, 83 | } 84 | 85 | cfg, err := testEnv.Start() 86 | Expect(err).NotTo(HaveOccurred()) 87 | Expect(cfg).NotTo(BeNil()) 88 | 89 | err = div2alpha1.AddToScheme(scheme) 90 | Expect(err).NotTo(HaveOccurred()) 91 | err = clientgoscheme.AddToScheme(scheme) 92 | Expect(err).NotTo(HaveOccurred()) 93 | 94 | //+kubebuilder:scaffold:scheme 95 | 96 | k8sClient, err = client.New(cfg, client.Options{Scheme: scheme}) 97 | Expect(err).NotTo(HaveOccurred()) 98 | Expect(k8sClient).NotTo(BeNil()) 99 | 100 | var nodes []*corev1.Node 101 | nodes = append(nodes, newNode(fmt.Sprintf("server-test-%d", 0), 8), newNode(fmt.Sprintf("server-test-%d", 1), 8)) 102 | nodes = append(nodes, newNode(fmt.Sprintf("server-test-%d", 2), 0), newNode(fmt.Sprintf("server-test-%d", 3), 4)) 103 | 104 | for _, node := range nodes { 105 | err := k8sClient.Create(context.Background(), node, &client.CreateOptions{}) 106 | Expect(err).NotTo(HaveOccurred()) 107 | } 108 | 109 | var nodeList corev1.NodeList 110 | err = k8sClient.List(context.Background(), &nodeList, &client.ListOptions{}) 111 | Expect(err).NotTo(HaveOccurred()) 112 | for _, node := range nodeList.Items { 113 | fmt.Printf("node: %s added to cluster\n", node.Name) 114 | } 115 | 116 | metricPort := config.GinkgoConfig.ParallelNode + 8200 117 | metricAddress := fmt.Sprintf(":%d", metricPort) 118 | mgr, err := ctrl.NewManager(cfg, ctrl.Options{ 119 | Scheme: scheme, 120 | MetricsBindAddress: metricAddress, 121 | }) 122 | Expect(err).NotTo(HaveOccurred()) 123 | 124 | ctx := dicontext.NewContext(cfg, 125 | mgr.GetClient(), 126 | mgr.GetEventRecorderFor("di-server"), 127 | ctrl.Log.WithName("di-server")) 128 | 129 | processor := NewProcessor(ctx) 130 | localServingPort = port + config.GinkgoConfig.ParallelNode 131 | addrPort := fmt.Sprintf("%s:%d", localServingHost, localServingPort) 132 | go func() { 133 | diServer := NewDIServer(ctx, processor, addrPort) 134 | mgr.Add(diServer) 135 | err := mgr.Start(ctrl.SetupSignalHandler()) 136 | fmt.Println(err.Error()) 137 | }() 138 | 139 | // wait for the server to get ready 140 | tcpAddr, err := net.ResolveTCPAddr("tcp", addrPort) 141 | Expect(err).NotTo(HaveOccurred()) 142 | 143 | Eventually(func() error { 144 | conn, err := net.DialTCP("tcp", nil, tcpAddr) 145 | if err != nil { 146 | return err 147 | } 148 | conn.Close() 149 | return nil 150 | }, timeout, interval).Should(Succeed()) 151 | }, 60) 152 | 153 | var _ = AfterSuite(func() { 154 | By("tearing down the test environment") 155 | err := testEnv.Stop() 156 | Expect(err).NotTo(HaveOccurred()) 157 | }) 158 | 159 | func newNode(name string, gpus int) *corev1.Node { 160 | return &corev1.Node{ 161 | TypeMeta: 
metav1.TypeMeta{ 162 | APIVersion: "v1", 163 | Kind: "Node", 164 | }, 165 | ObjectMeta: metav1.ObjectMeta{ 166 | Name: name, 167 | }, 168 | Status: corev1.NodeStatus{ 169 | Allocatable: corev1.ResourceList{ 170 | "nvidia.com/gpu": resource.MustParse(strconv.Itoa(gpus)), 171 | corev1.ResourceCPU: resource.MustParse("32"), 172 | corev1.ResourceMemory: resource.MustParse("128Gi"), 173 | }, 174 | }, 175 | } 176 | } 177 | -------------------------------------------------------------------------------- /cmd/operator/operator.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2021 The OpenDILab authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package operator 17 | 18 | import ( 19 | "flag" 20 | "time" 21 | 22 | "github.com/spf13/cobra" 23 | "k8s.io/apimachinery/pkg/runtime" 24 | utilruntime "k8s.io/apimachinery/pkg/util/runtime" 25 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 26 | ctrl "sigs.k8s.io/controller-runtime" 27 | "sigs.k8s.io/controller-runtime/pkg/healthz" 28 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 29 | 30 | cmdcommon "opendilab.org/di-orchestrator/cmd/common" 31 | alloc "opendilab.org/di-orchestrator/pkg/allocator" 32 | alloctypes "opendilab.org/di-orchestrator/pkg/allocator/types" 33 | div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1" 34 | dicommon "opendilab.org/di-orchestrator/pkg/common" 35 | dicontext "opendilab.org/di-orchestrator/pkg/context" 36 | "opendilab.org/di-orchestrator/pkg/controllers" 37 | ) 38 | 39 | type CreateOptions struct { 40 | *cmdcommon.GenericFlags 41 | 42 | SyncPeriod *time.Duration 43 | MetricAddress string 44 | ProbeAddress string 45 | EnableLeaderElection bool 46 | } 47 | 48 | func NewCreateOptions(genFlags *cmdcommon.GenericFlags) *CreateOptions { 49 | DefaultSyncPeriod := 1 * time.Minute 50 | DefaultMetricAddress := ":8443" 51 | DefaultProbeAddress := ":8080" 52 | DefaultEnableLeaderElection := false 53 | return &CreateOptions{ 54 | GenericFlags: genFlags, 55 | SyncPeriod: &DefaultSyncPeriod, 56 | MetricAddress: DefaultMetricAddress, 57 | ProbeAddress: DefaultProbeAddress, 58 | EnableLeaderElection: DefaultEnableLeaderElection, 59 | } 60 | } 61 | 62 | func (o *CreateOptions) AddFlags(cmd *cobra.Command) { 63 | cmd.Flags().DurationVar(o.SyncPeriod, "sync-period", *o.SyncPeriod, "Resync period for controllers.") 64 | cmd.Flags().StringVar(&o.MetricAddress, "metric-addr", o.MetricAddress, "The address the metric endpoint binds to.") 65 | cmd.Flags().StringVar(&o.ProbeAddress, "probe-addr", o.ProbeAddress, "The address the probe endpoint binds to.") 66 | cmd.Flags().BoolVar(&o.EnableLeaderElection, "leader-elect", o.EnableLeaderElection, 67 | "Enable leader election for controller manager. 
"+ 68 | "Enabling this will ensure there is only one active controller manager.") 69 | } 70 | 71 | func NewCmdOperator(genFlags *cmdcommon.GenericFlags) *cobra.Command { 72 | o := NewCreateOptions(genFlags) 73 | var operatorCmd = &cobra.Command{ 74 | Use: "operator", 75 | Short: "Command to run di-operator ", 76 | Long: `Run di-operator with specified configuration. 77 | 78 | Examples: 79 | # Start di-operator with metric address and probe address specified. 80 | di-orchestrator operator --metric-addr :8443 --probe-addr :8080 81 | `, 82 | Run: func(cmd *cobra.Command, args []string) { 83 | cobra.CheckErr(runCommand(cmd, o)) 84 | }, 85 | } 86 | 87 | o.AddFlags(operatorCmd) 88 | return operatorCmd 89 | } 90 | 91 | var ( 92 | scheme = runtime.NewScheme() 93 | setupLog = ctrl.Log.WithName("setup") 94 | ) 95 | 96 | func init() { 97 | utilruntime.Must(clientgoscheme.AddToScheme(scheme)) 98 | 99 | utilruntime.Must(div2alpha1.AddToScheme(scheme)) 100 | //+kubebuilder:scaffold:scheme 101 | } 102 | 103 | func runCommand(cmd *cobra.Command, options *CreateOptions) error { 104 | flag.Parse() 105 | logger := zap.New(zap.UseFlagOptions(options.GenericFlags.ZapOpts)) 106 | ctrl.SetLogger(logger) 107 | 108 | // set common config 109 | dicommon.SetServiceDomainName(options.ServiceDomainName) 110 | dicommon.SetDIServerURL(options.DIServerURL) 111 | 112 | config := ctrl.GetConfigOrDie() 113 | config.QPS = float32(options.QPS) 114 | config.Burst = options.Burst 115 | mgr, err := ctrl.NewManager(config, ctrl.Options{ 116 | Scheme: scheme, 117 | SyncPeriod: options.SyncPeriod, 118 | MetricsBindAddress: options.MetricAddress, 119 | HealthProbeBindAddress: options.ProbeAddress, 120 | LeaderElection: options.EnableLeaderElection, 121 | LeaderElectionID: "12841a5d.opendilab.org", 122 | }) 123 | if err != nil { 124 | setupLog.Error(err, "unable to start manager") 125 | return err 126 | } 127 | 128 | ctx := dicontext.NewContext(config, 129 | mgr.GetClient(), 130 | mgr.GetEventRecorderFor("di-operator"), 131 | ctrl.Log.WithName("di-operator")) 132 | reconciler := controllers.NewDIJobReconciler(mgr.GetScheme(), ctx) 133 | if err = reconciler.SetupWithManager(mgr); err != nil { 134 | setupLog.Error(err, "unable to create controller", "controller", "DIJob") 135 | return err 136 | } 137 | 138 | ctx = dicontext.NewContext(config, 139 | mgr.GetClient(), 140 | mgr.GetEventRecorderFor("di-allocator"), 141 | ctrl.Log.WithName("di-allocator")) 142 | allocator := alloc.NewAllocator(mgr.GetScheme(), ctx, *alloctypes.NewFitPolicy(), *options.SyncPeriod) 143 | if err = allocator.SetupWithManager(mgr); err != nil { 144 | setupLog.Error(err, "unable to create allocator", "allocator", "DIJob") 145 | return err 146 | } 147 | 148 | //+kubebuilder:scaffold:builder 149 | 150 | if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { 151 | setupLog.Error(err, "unable to set up health check") 152 | return err 153 | } 154 | if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { 155 | setupLog.Error(err, "unable to set up ready check") 156 | return err 157 | } 158 | 159 | setupLog.Info("starting manager") 160 | if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { 161 | setupLog.Error(err, "problem running manager") 162 | return err 163 | } 164 | return nil 165 | } 166 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | # di-operator version 3 | APP_VERSION ?= 0.1.0 4 | VERSION ?= 
v1.1.3 5 | MASTER_VERSION := $(VERSION) 6 | 7 | COMMIT_SHORT_SHA=$(shell git log -n 1 | head -n 1 | sed -e 's/^commit //' | head -c 8) 8 | 9 | VERSION := $(VERSION)-${COMMIT_SHORT_SHA} 10 | 11 | ifeq ($(GIT_BRANCH),master) 12 | VERSION := $(MASTER_VERSION) 13 | endif 14 | 15 | ifneq ($(findstring release,$(GIT_BRANCH)),) 16 | VERSION := $(MASTER_VERSION) 17 | endif 18 | 19 | # Image URL to use all building/pushing image targets 20 | IMG_BASE ?= opendilab/di-orchestrator 21 | 22 | IMG ?= ${IMG_BASE}:${VERSION} 23 | MASTER_IMG ?= ${IMG_BASE}:${MASTER_VERSION} 24 | 25 | # Produce CRDs that work back to Kubernetes 1.11 (no version conversion) 26 | CRD_OPTIONS ?= "crd:trivialVersions=true,preserveUnknownFields=false" 27 | 28 | # Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set) 29 | ifeq (,$(shell go env GOBIN)) 30 | GOBIN=$(shell go env GOPATH)/bin 31 | else 32 | GOBIN=$(shell go env GOBIN) 33 | endif 34 | 35 | all: build 36 | 37 | ##@ General 38 | 39 | # The help target prints out all targets with their descriptions organized 40 | # beneath their categories. The categories are represented by '##@' and the 41 | # target descriptions by '##'. The awk command is responsible for reading the 42 | # entire set of makefiles included in this invocation, looking for lines of the 43 | # file as xyz: ## something, and then pretty-format the target and help. Then, 44 | # if there's a line with ##@ something, that gets pretty-printed as a category. 45 | # More info on the usage of ANSI control characters for terminal formatting: 46 | # https://en.wikipedia.org/wiki/ANSI_escape_code#SGR_parameters 47 | # More info on the awk command: 48 | # http://linuxcommand.org/lc3_adv_awk.php 49 | 50 | help: ## Display this help. 51 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 52 | 53 | ##@ Development 54 | 55 | manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. 56 | $(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=di-operator-cluster-role webhook paths="./..." output:crd:artifacts:config=config/crd/bases 57 | cd config/manager && $(KUSTOMIZE) edit set image ${IMG_BASE}=${MASTER_IMG} 58 | ./hack/update-image-tags.sh config/manager ${MASTER_VERSION} 59 | ./hack/update-version.sh ${MASTER_VERSION} ${APP_VERSION} 60 | ## generate installer scripts 61 | $(KUSTOMIZE) build config/default > config/di-manager.yaml 62 | 63 | 64 | # dev-manifests will add COMMIT_SHORT_SHA to ci version, and image tag, so it is only used for development 65 | # use `make manifests` when committing to git 66 | dev-manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. 67 | $(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=di-operator-cluster-role webhook paths="./..." output:crd:artifacts:config=config/crd/bases 68 | cd config/manager && $(KUSTOMIZE) edit set image ${IMG_BASE}=${IMG} 69 | ./hack/update-image-tags.sh config/manager ${VERSION} 70 | ./hack/update-version.sh ${VERSION} ${APP_VERSION} 71 | $(KUSTOMIZE) build config/default > config/di-manager.yaml 72 | 73 | generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. 74 | $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." 75 | 76 | fmt: ## Run go fmt against code. 77 | go fmt ./...
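# Example development loop (a sketch, not an official workflow; it assumes docker is available and your kubeconfig points at a disposable test cluster): make dev-images && make docker-push && make dev-deploy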
78 | 79 | vet: ## Run go vet against code. 80 | go vet ./... 81 | 82 | # Run golangci-lint 83 | # lint: golangci-lint 84 | lint: 85 | golangci-lint run -v --timeout=5m 86 | 87 | .PHONY: test 88 | test: ginkgo ## Run tests. 89 | # $(GINKGO) -nodes 4 -v -cover -coverprofile=coverage.out ./pkg/... 90 | $(GINKGO) -r -cover -coverprofile=coverage.out ./pkg 91 | go tool cover -func=./pkg/server/coverage.out 92 | go tool cover -func=./pkg/common/coverage.out 93 | go tool cover -func=./pkg/controllers/coverage.out 94 | 95 | .PHONY: test-e2e 96 | test-e2e: ginkgo dev-deploy ## Run e2e tests 97 | ${GINKGO} -cover ./test/e2e 98 | 99 | ##@ Build 100 | 101 | build: generate ## Build di-operator binary. 102 | go build -o bin/di-orchestrator ./main.go 103 | 104 | docker-build: ## Build docker image with the di-operator. 105 | docker build -t ${IMG} -f Dockerfile . 106 | 107 | dev-images: build 108 | docker build -t ${IMG} -f Dockerfile.dev . 109 | 110 | docker-push: ## Push docker image with the di-operator. 111 | docker push ${IMG} 112 | 113 | docker-release: ## Release docker image with the di-operator. 114 | docker pull ${IMG} 115 | docker tag ${IMG} ${MASTER_IMG} 116 | docker push ${MASTER_IMG} 117 | 118 | ##@ Deployment 119 | 120 | install: manifests kustomize ## Install CRDs into the K8s cluster specified in ~/.kube/config. 121 | $(KUSTOMIZE) build config/crd | kubectl apply -f - 122 | 123 | uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified in ~/.kube/config. 124 | $(KUSTOMIZE) build config/crd | kubectl delete -f - 125 | 126 | deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config. 127 | $(KUSTOMIZE) build config/default | kubectl apply -f - 128 | 129 | dev-deploy: dev-manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config. 130 | $(KUSTOMIZE) build config/default | kubectl apply -f - 131 | 132 | undeploy: ## Undeploy controller from the K8s cluster specified in ~/.kube/config. 133 | $(KUSTOMIZE) build config/default | kubectl delete -f - 134 | 135 | dev-undeploy: ## Undeploy controller from the K8s cluster specified in ~/.kube/config. 136 | $(KUSTOMIZE) build config/default | kubectl delete -f - 137 | 138 | CONTROLLER_GEN = $(shell pwd)/bin/controller-gen 139 | controller-gen: ## Download controller-gen locally if necessary. 140 | $(call go-get-tool,$(CONTROLLER_GEN),sigs.k8s.io/controller-tools/cmd/controller-gen@v0.6.2) 141 | 142 | KUSTOMIZE = $(shell pwd)/bin/kustomize 143 | kustomize: ## Download kustomize locally if necessary. 144 | $(call go-get-tool,$(KUSTOMIZE),sigs.k8s.io/kustomize/kustomize/v3@v3.8.7) 145 | 146 | GINKGO = $(shell pwd)/bin/ginkgo 147 | ginkgo: ## Download ginkgo locally if necessary. 148 | $(call go-get-tool,$(GINKGO),github.com/onsi/ginkgo/ginkgo@v1.14.1) 149 | 150 | GOLANGCI_LINT = $(shell pwd)/bin/golangci-lint 151 | golangci-lint: ## Download golangci-lint locally if necessary. 152 | $(call go-get-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/cmd/golangci-lint@v1.46.2) 153 | 154 | # go-get-tool will 'go get' any package $2 and install it to $1. 
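# Example usage (this exact call backs the kustomize target above): $(call go-get-tool,$(KUSTOMIZE),sigs.k8s.io/kustomize/kustomize/v3@v3.8.7)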
155 | PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST)))) 156 | define go-get-tool 157 | @[ -f $(1) ] || { \ 158 | set -e ;\ 159 | TMP_DIR=$$(mktemp -d) ;\ 160 | cd $$TMP_DIR ;\ 161 | go mod init tmp ;\ 162 | echo "Downloading $(2)" ;\ 163 | GOBIN=$(PROJECT_DIR)/bin go get $(2) ;\ 164 | rm -rf $$TMP_DIR ;\ 165 | } 166 | endef 167 | -------------------------------------------------------------------------------- /pkg/controllers/handler.go: -------------------------------------------------------------------------------- 1 | package controllers 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | 7 | "opendilab.org/di-orchestrator/pkg/api/v2alpha1" 8 | dicontext "opendilab.org/di-orchestrator/pkg/context" 9 | ) 10 | 11 | var ( 12 | jobStatusHandlers map[v2alpha1.Phase]map[v2alpha1.Phase][]func(ctx dicontext.Context, job *v2alpha1.DIJob) 13 | ) 14 | 15 | func init() { 16 | jobStatusHandlers = make(map[v2alpha1.Phase]map[v2alpha1.Phase][]func(ctx dicontext.Context, job *v2alpha1.DIJob)) 17 | registerJobStatusHandlers() 18 | } 19 | 20 | func registerJobStatusHandlers() { 21 | registerEach := func(old, new v2alpha1.Phase, handlers ...func(ctx dicontext.Context, job *v2alpha1.DIJob)) { 22 | if jobStatusHandlers[old] == nil { 23 | jobStatusHandlers[old] = make(map[v2alpha1.Phase][]func(ctx dicontext.Context, job *v2alpha1.DIJob)) 24 | } 25 | jobStatusHandlers[old][new] = handlers 26 | } 27 | 28 | registerEach(v2alpha1.JobPending, v2alpha1.JobPending, nothing) 29 | registerEach(v2alpha1.JobPending, v2alpha1.JobStarting, onJobStarting) 30 | registerEach(v2alpha1.JobPending, v2alpha1.JobRestarting, onJobRestarting, increaseRestartCount) 31 | registerEach(v2alpha1.JobPending, v2alpha1.JobRescheduling, onJobRescheduling, increaseRescheduleCount) 32 | registerEach(v2alpha1.JobPending, v2alpha1.JobRunning, onJobRunning) 33 | registerEach(v2alpha1.JobPending, v2alpha1.JobFailed, onJobFailed, updateReadyReplicas) 34 | registerEach(v2alpha1.JobPending, v2alpha1.JobSucceeded, onJobSucceeded, updateReadyReplicas) 35 | 36 | registerEach(v2alpha1.JobStarting, v2alpha1.JobPending, nothing) 37 | registerEach(v2alpha1.JobStarting, v2alpha1.JobStarting, nothing) 38 | registerEach(v2alpha1.JobStarting, v2alpha1.JobRestarting, onJobRestarting, increaseRestartCount) 39 | registerEach(v2alpha1.JobStarting, v2alpha1.JobRescheduling, onJobRescheduling, increaseRescheduleCount) 40 | registerEach(v2alpha1.JobStarting, v2alpha1.JobRunning, onJobRunning) 41 | registerEach(v2alpha1.JobStarting, v2alpha1.JobFailed, onJobFailed, updateReadyReplicas) 42 | registerEach(v2alpha1.JobStarting, v2alpha1.JobSucceeded, onJobSucceeded, updateReadyReplicas) 43 | 44 | registerEach(v2alpha1.JobRestarting, v2alpha1.JobPending, nothing) 45 | registerEach(v2alpha1.JobRestarting, v2alpha1.JobStarting, onJobStarting) 46 | registerEach(v2alpha1.JobRestarting, v2alpha1.JobRestarting, nothing) 47 | registerEach(v2alpha1.JobRestarting, v2alpha1.JobRescheduling, onJobRescheduling, increaseRescheduleCount) 48 | registerEach(v2alpha1.JobRestarting, v2alpha1.JobRunning, onJobRunning) 49 | registerEach(v2alpha1.JobRestarting, v2alpha1.JobFailed, onJobFailed, updateReadyReplicas) 50 | registerEach(v2alpha1.JobRestarting, v2alpha1.JobSucceeded, onJobSucceeded, updateReadyReplicas) 51 | 52 | registerEach(v2alpha1.JobRescheduling, v2alpha1.JobPending, nothing) 53 | registerEach(v2alpha1.JobRescheduling, v2alpha1.JobStarting, onJobStarting) 54 | registerEach(v2alpha1.JobRescheduling, 
v2alpha1.JobRestarting, onJobRestarting, increaseRestartCount) 55 | registerEach(v2alpha1.JobRescheduling, v2alpha1.JobRescheduling, nothing) 56 | registerEach(v2alpha1.JobRescheduling, v2alpha1.JobRunning, onJobRunning) 57 | registerEach(v2alpha1.JobRescheduling, v2alpha1.JobFailed, onJobFailed, updateReadyReplicas) 58 | registerEach(v2alpha1.JobRescheduling, v2alpha1.JobSucceeded, onJobSucceeded, updateReadyReplicas) 59 | 60 | registerEach(v2alpha1.JobRunning, v2alpha1.JobPending, nothing) 61 | registerEach(v2alpha1.JobRunning, v2alpha1.JobStarting, onJobStarting) 62 | registerEach(v2alpha1.JobRunning, v2alpha1.JobRestarting, onJobRestarting, increaseRestartCount) 63 | registerEach(v2alpha1.JobRunning, v2alpha1.JobRescheduling, onJobRescheduling, increaseRescheduleCount) 64 | registerEach(v2alpha1.JobRunning, v2alpha1.JobRunning, nothing) 65 | registerEach(v2alpha1.JobRunning, v2alpha1.JobFailed, onJobFailed, updateReadyReplicas) 66 | registerEach(v2alpha1.JobRunning, v2alpha1.JobSucceeded, onJobSucceeded, updateReadyReplicas) 67 | 68 | registerEach(v2alpha1.JobFailed, v2alpha1.JobPending, nothing) 69 | registerEach(v2alpha1.JobFailed, v2alpha1.JobStarting, onJobStarting) 70 | registerEach(v2alpha1.JobFailed, v2alpha1.JobRestarting, onJobRestarting, increaseRestartCount) 71 | registerEach(v2alpha1.JobFailed, v2alpha1.JobRescheduling, onJobRescheduling, increaseRescheduleCount) 72 | registerEach(v2alpha1.JobFailed, v2alpha1.JobRunning, onJobRunning) 73 | registerEach(v2alpha1.JobFailed, v2alpha1.JobFailed, nothing) 74 | registerEach(v2alpha1.JobFailed, v2alpha1.JobSucceeded, onJobSucceeded, updateReadyReplicas) 75 | 76 | registerEach(v2alpha1.JobSucceeded, v2alpha1.JobPending, nothing) 77 | registerEach(v2alpha1.JobSucceeded, v2alpha1.JobStarting, onJobStarting) 78 | registerEach(v2alpha1.JobSucceeded, v2alpha1.JobRestarting, onJobRestarting, increaseRestartCount) 79 | registerEach(v2alpha1.JobSucceeded, v2alpha1.JobRescheduling, onJobRescheduling, increaseRescheduleCount) 80 | registerEach(v2alpha1.JobSucceeded, v2alpha1.JobRunning, onJobRunning) 81 | registerEach(v2alpha1.JobSucceeded, v2alpha1.JobFailed, onJobFailed, updateReadyReplicas) 82 | registerEach(v2alpha1.JobSucceeded, v2alpha1.JobSucceeded, nothing) 83 | } 84 | 85 | func HandleJobStatus(ctx dicontext.Context, old, new *v2alpha1.DIJob) { 86 | for _, handler := range jobStatusHandlers[old.Status.Phase][new.Status.Phase] { 87 | handler(ctx, new) 88 | } 89 | } 90 | 91 | func nothing(ctx dicontext.Context, job *v2alpha1.DIJob) {} 92 | 93 | func updateReadyReplicas(ctx dicontext.Context, job *v2alpha1.DIJob) { 94 | job.Status.ReadyReplicas = 0 95 | } 96 | 97 | func increaseRestartCount(ctx dicontext.Context, job *v2alpha1.DIJob) { 98 | job.Status.Restarts++ 99 | } 100 | 101 | func increaseRescheduleCount(ctx dicontext.Context, job *v2alpha1.DIJob) { 102 | job.Status.Reschedules++ 103 | } 104 | 105 | func onJobStarting(ctx dicontext.Context, job *v2alpha1.DIJob) { 106 | msg := "job is starting since all replicas are created." 107 | ctx.Recorder.Eventf(job, corev1.EventTypeNormal, dicontext.DIJobStartingReason, msg) 108 | } 109 | 110 | func onJobRunning(ctx dicontext.Context, job *v2alpha1.DIJob) { 111 | msg := "job is running since all replicas are ready." 112 | ctx.Recorder.Eventf(job, corev1.EventTypeNormal, dicontext.DIJobRunningReason, msg) 113 | } 114 | 115 | func onJobRestarting(ctx dicontext.Context, job *v2alpha1.DIJob) { 116 | msg := "job is restarting since conditions changed."
117 | 	ctx.Recorder.Eventf(job, corev1.EventTypeWarning, dicontext.DIJobRestartingReason, msg)
118 | }
119 | 
120 | func onJobRescheduling(ctx dicontext.Context, job *v2alpha1.DIJob) {
121 | 	msg := "job is rescheduling since replicas or allocation changed."
122 | 	ctx.Recorder.Eventf(job, corev1.EventTypeWarning, dicontext.DIJobReschedulingReason, msg)
123 | }
124 | 
125 | func onJobFailed(ctx dicontext.Context, job *v2alpha1.DIJob) {
126 | 	msg := "job failed since some replicas failed."
127 | 	ctx.Recorder.Eventf(job, corev1.EventTypeWarning, dicontext.DIJobFailedReason, msg)
128 | 	completionTime := metav1.Now()
129 | 	job.Status.CompletionTimestamp = &completionTime
130 | }
131 | 
132 | func onJobSucceeded(ctx dicontext.Context, job *v2alpha1.DIJob) {
133 | 	msg := "job succeeded since all replicas succeeded."
134 | 	ctx.Recorder.Eventf(job, corev1.EventTypeNormal, dicontext.DIJobSucceededReason, msg)
135 | 	completionTime := metav1.Now()
136 | 	job.Status.CompletionTimestamp = &completionTime
137 | }
138 | 
--------------------------------------------------------------------------------
/pkg/api/v2alpha1/zz_generated.deepcopy.go:
--------------------------------------------------------------------------------
1 | // +build !ignore_autogenerated
2 | 
3 | /*
4 | Copyright 2021 The OpenDILab authors.
5 | 
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 | 
10 |     http://www.apache.org/licenses/LICENSE-2.0
11 | 
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 | 
19 | // Code generated by controller-gen. DO NOT EDIT.
20 | 
21 | package v2alpha1
22 | 
23 | import (
24 | 	"k8s.io/api/core/v1"
25 | 	runtime "k8s.io/apimachinery/pkg/runtime"
26 | )
27 | 
28 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
29 | func (in *DIJob) DeepCopyInto(out *DIJob) {
30 | 	*out = *in
31 | 	out.TypeMeta = in.TypeMeta
32 | 	in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
33 | 	in.Spec.DeepCopyInto(&out.Spec)
34 | 	in.Status.DeepCopyInto(&out.Status)
35 | }
36 | 
37 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DIJob.
38 | func (in *DIJob) DeepCopy() *DIJob {
39 | 	if in == nil {
40 | 		return nil
41 | 	}
42 | 	out := new(DIJob)
43 | 	in.DeepCopyInto(out)
44 | 	return out
45 | }
46 | 
47 | // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
48 | func (in *DIJob) DeepCopyObject() runtime.Object {
49 | 	if c := in.DeepCopy(); c != nil {
50 | 		return c
51 | 	}
52 | 	return nil
53 | }
54 | 
55 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
56 | func (in *DIJobList) DeepCopyInto(out *DIJobList) {
57 | 	*out = *in
58 | 	out.TypeMeta = in.TypeMeta
59 | 	in.ListMeta.DeepCopyInto(&out.ListMeta)
60 | 	if in.Items != nil {
61 | 		in, out := &in.Items, &out.Items
62 | 		*out = make([]DIJob, len(*in))
63 | 		for i := range *in {
64 | 			(*in)[i].DeepCopyInto(&(*out)[i])
65 | 		}
66 | 	}
67 | }
68 | 
69 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DIJobList.
70 | func (in *DIJobList) DeepCopy() *DIJobList { 71 | if in == nil { 72 | return nil 73 | } 74 | out := new(DIJobList) 75 | in.DeepCopyInto(out) 76 | return out 77 | } 78 | 79 | // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 80 | func (in *DIJobList) DeepCopyObject() runtime.Object { 81 | if c := in.DeepCopy(); c != nil { 82 | return c 83 | } 84 | return nil 85 | } 86 | 87 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 88 | func (in *DIJobSpec) DeepCopyInto(out *DIJobSpec) { 89 | *out = *in 90 | if in.BackoffLimit != nil { 91 | in, out := &in.BackoffLimit, &out.BackoffLimit 92 | *out = new(int32) 93 | **out = **in 94 | } 95 | if in.Volumes != nil { 96 | in, out := &in.Volumes, &out.Volumes 97 | *out = make([]v1.Volume, len(*in)) 98 | for i := range *in { 99 | (*in)[i].DeepCopyInto(&(*out)[i]) 100 | } 101 | } 102 | if in.Tasks != nil { 103 | in, out := &in.Tasks, &out.Tasks 104 | *out = make([]Task, len(*in)) 105 | for i := range *in { 106 | (*in)[i].DeepCopyInto(&(*out)[i]) 107 | } 108 | } 109 | } 110 | 111 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DIJobSpec. 112 | func (in *DIJobSpec) DeepCopy() *DIJobSpec { 113 | if in == nil { 114 | return nil 115 | } 116 | out := new(DIJobSpec) 117 | in.DeepCopyInto(out) 118 | return out 119 | } 120 | 121 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 122 | func (in *DIJobStatus) DeepCopyInto(out *DIJobStatus) { 123 | *out = *in 124 | if in.CompletionTimestamp != nil { 125 | in, out := &in.CompletionTimestamp, &out.CompletionTimestamp 126 | *out = (*in).DeepCopy() 127 | } 128 | if in.TaskStatus != nil { 129 | in, out := &in.TaskStatus, &out.TaskStatus 130 | *out = make(map[string]TaskStatus, len(*in)) 131 | for key, val := range *in { 132 | var outVal map[v1.PodPhase]int32 133 | if val == nil { 134 | (*out)[key] = nil 135 | } else { 136 | in, out := &val, &outVal 137 | *out = make(TaskStatus, len(*in)) 138 | for key, val := range *in { 139 | (*out)[key] = val 140 | } 141 | } 142 | (*out)[key] = outVal 143 | } 144 | } 145 | if in.Allocation != nil { 146 | in, out := &in.Allocation, &out.Allocation 147 | *out = make([]string, len(*in)) 148 | copy(*out, *in) 149 | } 150 | out.Profilings = in.Profilings 151 | if in.Conditions != nil { 152 | in, out := &in.Conditions, &out.Conditions 153 | *out = make([]JobCondition, len(*in)) 154 | for i := range *in { 155 | (*in)[i].DeepCopyInto(&(*out)[i]) 156 | } 157 | } 158 | } 159 | 160 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DIJobStatus. 161 | func (in *DIJobStatus) DeepCopy() *DIJobStatus { 162 | if in == nil { 163 | return nil 164 | } 165 | out := new(DIJobStatus) 166 | in.DeepCopyInto(out) 167 | return out 168 | } 169 | 170 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 171 | func (in *JobCondition) DeepCopyInto(out *JobCondition) { 172 | *out = *in 173 | in.LastUpdateTime.DeepCopyInto(&out.LastUpdateTime) 174 | in.LastTransitionTime.DeepCopyInto(&out.LastTransitionTime) 175 | } 176 | 177 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new JobCondition. 
178 | func (in *JobCondition) DeepCopy() *JobCondition { 179 | if in == nil { 180 | return nil 181 | } 182 | out := new(JobCondition) 183 | in.DeepCopyInto(out) 184 | return out 185 | } 186 | 187 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 188 | func (in *Profilings) DeepCopyInto(out *Profilings) { 189 | *out = *in 190 | } 191 | 192 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Profilings. 193 | func (in *Profilings) DeepCopy() *Profilings { 194 | if in == nil { 195 | return nil 196 | } 197 | out := new(Profilings) 198 | in.DeepCopyInto(out) 199 | return out 200 | } 201 | 202 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 203 | func (in *Task) DeepCopyInto(out *Task) { 204 | *out = *in 205 | in.Template.DeepCopyInto(&out.Template) 206 | } 207 | 208 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Task. 209 | func (in *Task) DeepCopy() *Task { 210 | if in == nil { 211 | return nil 212 | } 213 | out := new(Task) 214 | in.DeepCopyInto(out) 215 | return out 216 | } 217 | 218 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 219 | func (in TaskStatus) DeepCopyInto(out *TaskStatus) { 220 | { 221 | in := &in 222 | *out = make(TaskStatus, len(*in)) 223 | for key, val := range *in { 224 | (*out)[key] = val 225 | } 226 | } 227 | } 228 | 229 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TaskStatus. 230 | func (in TaskStatus) DeepCopy() TaskStatus { 231 | if in == nil { 232 | return nil 233 | } 234 | out := new(TaskStatus) 235 | in.DeepCopyInto(out) 236 | return *out 237 | } 238 | -------------------------------------------------------------------------------- /pkg/utils/util.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | 7 | corev1 "k8s.io/api/core/v1" 8 | "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" 9 | "k8s.io/apimachinery/pkg/runtime" 10 | "k8s.io/apimachinery/pkg/types" 11 | utilrand "k8s.io/apimachinery/pkg/util/rand" 12 | 13 | div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1" 14 | dicommon "opendilab.org/di-orchestrator/pkg/common" 15 | ) 16 | 17 | const ( 18 | randomLength = 5 19 | ) 20 | 21 | func Int32(i int32) *int32 { 22 | return &i 23 | } 24 | 25 | func Bool(i bool) *bool { 26 | return &i 27 | } 28 | 29 | func GenerateName(name string) string { 30 | return fmt.Sprintf("%s-%s", name, utilrand.String(randomLength)) 31 | } 32 | 33 | func NamespacedName(namespace, name string) string { 34 | return fmt.Sprintf("%s/%s", namespace, name) 35 | } 36 | 37 | func SplitNamespaceName(namespaceName string) (types.NamespacedName, error) { 38 | strs := strings.Split(namespaceName, "/") 39 | if len(strs) != 2 { 40 | return types.NamespacedName{}, fmt.Errorf("invalid namespace/name %s", namespaceName) 41 | } 42 | return types.NamespacedName{Namespace: strs[0], Name: strs[1]}, nil 43 | } 44 | 45 | func ReplicaName(jobName string, taskName string, rank int) string { 46 | return fmt.Sprintf("%s-%s-%d", jobName, taskName, rank) 47 | } 48 | 49 | func PodFQDN(name, subdomain, namespace, domainName string) string { 50 | return fmt.Sprintf("%s.%s.%s.%s", name, subdomain, namespace, domainName) 51 | } 52 | 53 | func IsSucceeded(job *div2alpha1.DIJob) bool { 54 | 
return job.Status.Phase == div2alpha1.JobSucceeded 55 | } 56 | 57 | func IsFailed(job *div2alpha1.DIJob) bool { 58 | return job.Status.Phase == div2alpha1.JobFailed 59 | } 60 | 61 | func GetObjectFromUnstructured(obj interface{}, dest interface{}) error { 62 | us, ok := obj.(*unstructured.Unstructured) 63 | if !ok { 64 | return fmt.Errorf("the object %s is not unstructured", obj) 65 | } 66 | err := runtime.DefaultUnstructuredConverter.FromUnstructured(us.UnstructuredContent(), dest) 67 | if err != nil { 68 | return err 69 | } 70 | 71 | return nil 72 | } 73 | 74 | func GetDefaultPortFromPod(pod *corev1.Pod) (int32, bool) { 75 | for _, c := range pod.Spec.Containers { 76 | if c.Name != dicommon.DefaultContainerName { 77 | continue 78 | } 79 | for _, port := range c.Ports { 80 | if port.Name == dicommon.DefaultPortName { 81 | return port.ContainerPort, true 82 | } 83 | } 84 | } 85 | return -1, false 86 | } 87 | 88 | func AddPortToPod(pod *corev1.Pod, port corev1.ContainerPort) { 89 | for i := range pod.Spec.Containers { 90 | if pod.Spec.Containers[i].Name != dicommon.DefaultContainerName { 91 | continue 92 | } 93 | if pod.Spec.Containers[i].Ports == nil { 94 | pod.Spec.Containers[i].Ports = []corev1.ContainerPort{} 95 | } 96 | pod.Spec.Containers[i].Ports = append(pod.Spec.Containers[i].Ports, port) 97 | } 98 | } 99 | 100 | func GenLabels(job div2alpha1.DIJob) map[string]string { 101 | return map[string]string{ 102 | dicommon.LabelJob: strings.Replace(job.Name, "/", "-", -1), 103 | dicommon.LabelOperator: dicommon.OperatorName, 104 | } 105 | } 106 | 107 | func AddLabelsToPod(pod *corev1.Pod, labels map[string]string) { 108 | if pod.ObjectMeta.Labels == nil { 109 | pod.ObjectMeta.Labels = make(map[string]string) 110 | } 111 | for k, v := range labels { 112 | pod.ObjectMeta.Labels[k] = v 113 | } 114 | } 115 | 116 | func AddAnnotationsToPod(pod *corev1.Pod, annotations map[string]string) { 117 | if pod.ObjectMeta.Annotations == nil { 118 | pod.ObjectMeta.Annotations = make(map[string]string) 119 | } 120 | for k, v := range annotations { 121 | pod.ObjectMeta.Annotations[k] = v 122 | } 123 | } 124 | 125 | func AddEnvsToPod(pod *corev1.Pod, envs map[string]string) { 126 | for i := range pod.Spec.Containers { 127 | if len(pod.Spec.Containers[i].Env) == 0 { 128 | pod.Spec.Containers[i].Env = make([]corev1.EnvVar, 0) 129 | } 130 | for k, v := range envs { 131 | env := corev1.EnvVar{Name: k, Value: v} 132 | pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, env) 133 | } 134 | } 135 | } 136 | 137 | func GetEnvFromPod(pod *corev1.Pod, envName string) (string, bool) { 138 | for _, container := range pod.Spec.Containers { 139 | if container.Name != dicommon.DefaultContainerName { 140 | continue 141 | } 142 | for _, env := range container.Env { 143 | if env.Name == envName { 144 | return env.Value, true 145 | } 146 | } 147 | } 148 | return "", false 149 | } 150 | 151 | func CountScheduledPods(pods []*corev1.Pod) int { 152 | count := 0 153 | for _, pod := range pods { 154 | for _, c := range pod.Status.Conditions { 155 | if c.Type == corev1.PodScheduled && c.Status == corev1.ConditionTrue { 156 | count++ 157 | } 158 | } 159 | } 160 | return count 161 | } 162 | 163 | func CountReadyPods(pods []*corev1.Pod) int { 164 | count := 0 165 | for _, pod := range pods { 166 | if IsPodTerminating(pod) { 167 | continue 168 | } 169 | for _, c := range pod.Status.ContainerStatuses { 170 | if c.Ready { 171 | count++ 172 | } 173 | } 174 | } 175 | return count 176 | } 177 | 178 | func CountCompletedPods(pods 
[]*corev1.Pod, preemptible bool) (succeeded, failed int) {
179 | 	succeeded = 0
180 | 	failed = 0
181 | 	for _, pod := range pods {
182 | 		// a pod counts as succeeded when its phase is Succeeded; it counts as
183 | 		// failed only when IsPodFailed decides the failure was not intentional.
184 | 		if IsPodSucceeded(pod) {
185 | 			succeeded++
186 | 			continue
187 | 		}
188 | 		if IsPodFailed(pod, preemptible) {
189 | 			failed++
190 | 		}
191 | 	}
192 | 	return succeeded, failed
193 | }
194 | 
195 | func SetPodResources(pod *corev1.Pod, resources corev1.ResourceRequirements) {
196 | 	for i := range pod.Spec.Containers {
197 | 		if pod.Spec.Containers[i].Name != dicommon.DefaultContainerName {
198 | 			continue
199 | 		}
200 | 		pod.Spec.Containers[i].Resources = resources
201 | 	}
202 | }
203 | 
204 | func GetPodResources(spec *corev1.PodSpec) corev1.ResourceRequirements {
205 | 	for _, container := range spec.Containers {
206 | 		if container.Name != dicommon.DefaultContainerName {
207 | 			continue
208 | 		}
209 | 		return container.Resources
210 | 	}
211 | 	return corev1.ResourceRequirements{}
212 | }
213 | 
214 | func FilterPods(pods []*corev1.Pod, filters Filters) []*corev1.Pod {
215 | 	results := []*corev1.Pod{}
216 | 	for _, pod := range pods {
217 | 		if filters.Apply(pod) {
218 | 			results = append(results, pod)
219 | 		}
220 | 	}
221 | 
222 | 	return results
223 | }
224 | 
225 | func SplitTypedPods(pods []*corev1.Pod) map[string][]*corev1.Pod {
226 | 	out := make(map[string][]*corev1.Pod)
227 | 	for _, pod := range pods {
228 | 		if _, ok := out[pod.Labels[dicommon.LabelTaskName]]; !ok {
229 | 			out[pod.Labels[dicommon.LabelTaskName]] = make([]*corev1.Pod, 0)
230 | 		}
231 | 		out[pod.Labels[dicommon.LabelTaskName]] = append(out[pod.Labels[dicommon.LabelTaskName]], pod)
232 | 	}
233 | 	return out
234 | }
235 | 
236 | func IsPodTerminating(pod *corev1.Pod) bool {
237 | 	return pod.DeletionTimestamp != nil
238 | }
239 | 
240 | func IsPodSucceeded(pod *corev1.Pod) bool {
241 | 	return pod.Status.Phase == corev1.PodSucceeded
242 | }
243 | 
244 | func IsPodFailed(pod *corev1.Pod, preemptible bool) bool {
245 | 	exit143 := func(pod *corev1.Pod) bool {
246 | 		if pod.Status.ContainerStatuses == nil {
247 | 			return false
248 | 		}
249 | 		for _, status := range pod.Status.ContainerStatuses {
250 | 			if status.State.Terminated != nil && status.State.Terminated.ExitCode == 143 {
251 | 				return true
252 | 			}
253 | 		}
254 | 		return false
255 | 	}
256 | 
257 | 	if pod.Status.Phase != corev1.PodUnknown && pod.Status.Phase != corev1.PodFailed {
258 | 		return false
259 | 	}
260 | 	if pod.Status.Reason == "UnexpectedAdmissionError" {
261 | 		// admission errors (e.g. UnexpectedAdmissionError) are not counted as job failures.
262 | 		return false
263 | 	} else if strings.HasPrefix(pod.Status.Reason, "Outof") {
264 | 		// pods rejected because a node ran out of resources ("Outof..." reasons) are not counted as job failures.
265 | 		return false
266 | 	} else if preemptible && exit143(pod) {
267 | 		// exit code 143 (SIGTERM) on a preemptible job indicates an intentional termination, not a failure.
268 | 		return false
269 | 	} else if IsPodTerminating(pod) {
270 | 		// pods that are already being deleted are not counted as failed.
271 | 		return false
272 | 	}
273 | 	return true
274 | }
275 | 
--------------------------------------------------------------------------------
/pkg/controllers/dijob_controller.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2021 The OpenDILab authors.
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 |     http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package controllers
18 | 
19 | import (
20 | 	"context"
21 | 	"fmt"
22 | 
23 | 	corev1 "k8s.io/api/core/v1"
24 | 	"k8s.io/apimachinery/pkg/api/errors"
25 | 	"k8s.io/apimachinery/pkg/runtime"
26 | 	ctrl "sigs.k8s.io/controller-runtime"
27 | 	"sigs.k8s.io/controller-runtime/pkg/builder"
28 | 	"sigs.k8s.io/controller-runtime/pkg/client"
29 | 	"sigs.k8s.io/controller-runtime/pkg/handler"
30 | 	"sigs.k8s.io/controller-runtime/pkg/source"
31 | 
32 | 	div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1"
33 | 	dicommon "opendilab.org/di-orchestrator/pkg/common"
34 | 	dicontext "opendilab.org/di-orchestrator/pkg/context"
35 | 	diutil "opendilab.org/di-orchestrator/pkg/utils"
36 | )
37 | 
38 | // DIJobReconciler reconciles a DIJob object
39 | type DIJobReconciler struct {
40 | 	Scheme *runtime.Scheme
41 | 	ctx    dicontext.Context
42 | }
43 | 
44 | func NewDIJobReconciler(scheme *runtime.Scheme, ctx dicontext.Context) *DIJobReconciler {
45 | 	return &DIJobReconciler{
46 | 		Scheme: scheme,
47 | 		ctx:    ctx,
48 | 	}
49 | }
50 | 
51 | //+kubebuilder:rbac:groups=diengine.opendilab.org,resources=dijobs,verbs=get;list;watch;create;update;patch;delete
52 | //+kubebuilder:rbac:groups=diengine.opendilab.org,resources=dijobs/status,verbs=get;update;patch
53 | //+kubebuilder:rbac:groups=diengine.opendilab.org,resources=dijobs/finalizers,verbs=update
54 | //+kubebuilder:rbac:groups="",resources=pods;services;events,verbs=get;list;watch;create;update;patch;delete
55 | //+kubebuilder:rbac:groups="",resources=namespaces;nodes,verbs=get;list;watch
56 | 
57 | // Reconcile is part of the main kubernetes reconciliation loop which aims to
58 | // move the current state of the cluster closer to the desired state.
59 | // It fetches the DIJob named in the request, validates it, and reconciles
60 | // the job's pods and services so that the observed state converges to the
61 | // state declared in the job's spec; the pods and services of completed
62 | // jobs are cleaned up here as well.
63 | //
64 | // For more details, check Reconcile and its Result here:
65 | // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.7.0/pkg/reconcile
66 | func (r *DIJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
67 | 	log := r.ctx.Log.WithName("Reconcile").WithValues("job", req.NamespacedName)
68 | 
69 | 	// get DIJob object
70 | 	job := &div2alpha1.DIJob{}
71 | 	err := r.ctx.Get(ctx, req.NamespacedName, job)
72 | 	if err != nil {
73 | 		if !errors.IsNotFound(err) {
74 | 			log.Error(err, "get job.")
75 | 		}
76 | 		return ctrl.Result{}, nil
77 | 	}
78 | 
79 | 	// validate job
80 | 	validators := make(diutil.Validators, 0)
81 | 	validators = append(validators, diutil.TaskTypeNameValidator)
82 | 
83 | 	// give every task that has no name a default name derived from its
84 | 	// task.Type, so validation and replica naming can rely on task names.
85 | 	r.ctx.SetDefaultJobNameInCluster(ctx, job)
86 | 
87 | 	if err := validators.Apply(job); err != nil {
88 | 		log.Error(err, "job validation.")
89 | 		old := job.DeepCopy()
90 | 		r.ctx.UpdateJobStatus(job, div2alpha1.JobFailed, dicontext.DIJobFailedReason, err.Error())
91 | 		if err := r.ctx.UpdateJobPhaseAndConditionsInCluster(ctx, old, job); err != nil {
92 | 			log.Error(err, "update job phase and conditions.")
93 | 		}
94 | 		return ctrl.Result{}, nil
95 | 	}
96 | 
97 | 	pods, err := r.ctx.ListJobPods(ctx, job)
98 | 	if err != nil {
99 | 		log.Error(err, "list pods.")
100 | 		return ctrl.Result{}, nil
101 | 	}
102 | 
103 | 	services, err := r.ctx.ListJobServices(ctx, job)
104 | 	if err != nil {
105 | 		log.Error(err, "list services.")
106 | 		return ctrl.Result{}, nil
107 | 	}
108 | 
109 | 	// check job phase
110 | 	if diutil.IsSucceeded(job) || diutil.IsFailed(job) {
111 | 		if err := r.ctx.DeletePodsAndServices(ctx, job, pods, services); err != nil {
112 | 			log.Error(err, "delete pods and services.")
113 | 			return ctrl.Result{}, nil
114 | 		}
115 | 
116 | 		old := job.DeepCopy()
117 | 		job.Status.ReadyReplicas = 0
118 | 		job.Status.TaskStatus = nil
119 | 		if err := r.ctx.UpdateJobReplicaStatusInCluster(ctx, old, job); err != nil {
120 | 			log.Error(err, "update job replica status.")
121 | 			return ctrl.Result{}, nil
122 | 		}
123 | 		return ctrl.Result{}, nil
124 | 	}
125 | 
126 | 	if err := r.reconcileReplicas(ctx, job, pods, services); err != nil {
127 | 		log.Error(err, "reconcile pods.")
128 | 		return ctrl.Result{}, nil
129 | 	}
130 | 
131 | 	return ctrl.Result{}, nil
132 | }
133 | 
134 | // SetupWithManager sets up the controller with the Manager.
135 | func (r *DIJobReconciler) SetupWithManager(mgr ctrl.Manager) error {
136 | 	return ctrl.NewControllerManagedBy(mgr).
137 | 		For(&div2alpha1.DIJob{}).
138 | 		Watches(
139 | 			&source.Kind{Type: &div2alpha1.DIJob{}},
140 | 			&dicommon.EventHandler{
141 | 				OnCreateHandlers: []func(obj client.Object){
142 | 					r.onJobAddHandler,
143 | 				},
144 | 				OnUpdateHandlers: []func(old, new client.Object){
145 | 					r.onJobUpdateHandler,
146 | 				},
147 | 				OnDeleteHandlers: []func(obj client.Object){
148 | 					r.onJobDeleteHandler,
149 | 				},
150 | 			},
151 | 			builder.Predicates{},
152 | 		).
153 | 		Watches(
154 | 			&source.Kind{Type: &corev1.Pod{}},
155 | 			&handler.EnqueueRequestForOwner{
156 | 				IsController: true,
157 | 				OwnerType:    &div2alpha1.DIJob{},
158 | 			},
159 | 			builder.Predicates{},
160 | 		).
161 | 		Watches(
162 | 			&source.Kind{Type: &corev1.Service{}},
163 | 			&handler.EnqueueRequestForOwner{
164 | 				IsController: true,
165 | 				OwnerType:    &div2alpha1.DIJob{},
166 | 			},
167 | 		).
168 | 		Complete(r)
169 | }
170 | 
171 | // onJobAddHandler is the event handler responsible for handling job add events.
172 | func (r *DIJobReconciler) onJobAddHandler(obj client.Object) {
173 | 	jobkey := diutil.NamespacedName(obj.GetNamespace(), obj.GetName())
174 | 	log := r.ctx.Log.WithName("onJobAddHandler").WithValues("job", jobkey)
175 | 	job, ok := obj.(*div2alpha1.DIJob)
176 | 	if !ok {
177 | 		log.Error(fmt.Errorf("failed to convert object to dijob"), "")
178 | 		r.ctx.MarkIncorrectJobFailed(context.Background(), obj)
179 | 		return
180 | 	}
181 | 	old := job.DeepCopy()
182 | 
183 | 	// update job status
184 | 	msg := "job created."
185 | 	if job.Status.Phase == "" || job.Status.Phase == div2alpha1.JobPending {
186 | 		r.ctx.UpdateJobStatus(job, div2alpha1.JobPending, dicontext.DIJobPendingReason, msg)
187 | 		r.ctx.Recorder.Eventf(job, corev1.EventTypeNormal, dicontext.DIJobPendingReason, msg)
188 | 	}
189 | 
190 | 	if err := r.ctx.UpdateJobPhaseAndConditionsInCluster(context.Background(), old, job); err != nil {
191 | 		log.Error(err, "update job phase and conditions.")
192 | 	}
193 | }
194 | 
195 | func (r *DIJobReconciler) onJobUpdateHandler(old, new client.Object) {
196 | 	jobkey := diutil.NamespacedName(old.GetNamespace(), old.GetName())
197 | 	log := r.ctx.Log.WithName("onJobUpdateHandler").WithValues("job", jobkey)
198 | 	oldjob, ok := old.(*div2alpha1.DIJob)
199 | 	if !ok {
200 | 		log.Error(fmt.Errorf("failed to convert old object to dijob"), "")
201 | 		return
202 | 	}
203 | 	newjob, ok := new.(*div2alpha1.DIJob)
204 | 	if !ok {
205 | 		log.Error(fmt.Errorf("failed to convert new object to dijob"), "")
206 | 		return
207 | 	}
208 | 	stale := newjob.DeepCopy()
209 | 
210 | 	HandleJobStatus(r.ctx, oldjob, newjob)
211 | 	if err := r.ctx.UpdateJobRestartsAndReschedulesInCluster(context.Background(), stale, newjob); err != nil {
212 | 		log.Error(err, "update job restarts and reschedules.")
213 | 	}
214 | 	if err := r.ctx.UpdateJobReplicaStatusInCluster(context.Background(), stale, newjob); err != nil {
215 | 		log.Error(err, "update job replica status.")
216 | 	}
217 | }
218 | 
219 | func (r *DIJobReconciler) onJobDeleteHandler(obj client.Object) {
220 | 	jobkey := diutil.NamespacedName(obj.GetNamespace(), obj.GetName())
221 | 	log := r.ctx.Log.WithName("onJobDeleteHandler").WithValues("job", jobkey)
222 | 	log.Info("job deleted.")
223 | }
--------------------------------------------------------------------------------
/pkg/api/v2alpha1/dijob_types.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2021 The OpenDILab authors.
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 |     http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package v2alpha1
18 | 
19 | import (
20 | 	corev1 "k8s.io/api/core/v1"
21 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
22 | )
23 | 
24 | // EDIT THIS FILE!  THIS IS SCAFFOLDING FOR YOU TO OWN!
25 | // NOTE: json tags are required.  Any new fields you add must have json tags for the fields to be serialized.
26 | 
27 | // DIJobSpec defines the desired state of DIJob
28 | type DIJobSpec struct {
29 | 	// INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
30 | 	// Important: Run "make" to regenerate code after modifying this file
31 | 
32 | 	// Priority labels the priority of DIJob.
33 | 	// +kubebuilder:default=normal
34 | 	// +kubebuilder:validation:Enum=normal;high
35 | 	Priority Priority `json:"priority,omitempty"`
36 | 
37 | 	// CleanPodPolicy defines the policy to clean pods after the DIJob completes.
38 | 	// +kubebuilder:default=Running
39 | 	// +kubebuilder:validation:Enum=Running;All;None
40 | 	CleanPodPolicy CleanPodPolicy `json:"cleanPodPolicy,omitempty"`
41 | 
42 | 	// Preemptible defines whether the dijob can be preempted.
43 | 	// +kubebuilder:default=false
44 | 	Preemptible bool `json:"preemptible,omitempty"`
45 | 
46 | 	// BackoffLimit defines the restart limit for DIJob.
47 | 	// +kubebuilder:default=3
48 | 	BackoffLimit *int32 `json:"backoffLimit,omitempty"`
49 | 
50 | 	// Volumes defines the shared volumes for all tasks.
51 | 	Volumes []corev1.Volume `json:"volumes,omitempty"`
52 | 
53 | 	// Tasks provides flexible support for the different components (collector, learner, evaluator) of DI-engine.
54 | 	// +kubebuilder:validation:Required
55 | 	Tasks []Task `json:"tasks"`
56 | }
57 | 
58 | type Task struct {
59 | 	// Replicas defines the number of replicas of this task.
60 | 	// +kubebuilder:default=1
61 | 	// +kubebuilder:validation:Minimum=1
62 | 	Replicas int32 `json:"replicas,omitempty"`
63 | 
64 | 	// Type defines the type of the task.
65 | 	// +kubebuilder:validation:Enum=learner;collector;evaluator;none
66 | 	// +kubebuilder:validation:Required
67 | 	Type TaskType `json:"type,omitempty"`
68 | 
69 | 	// Name is the name of the task.
70 | 	Name string `json:"name,omitempty"`
71 | 
72 | 	// Template defines the pod template for DIJob.
73 | 	// +kubebuilder:validation:Required
74 | 	Template corev1.PodTemplateSpec `json:"template,omitempty"`
75 | }
76 | 
77 | // TaskType defines the type of task
78 | type TaskType string
79 | 
80 | const (
81 | 	// TaskTypeLearner represents learner task
82 | 	TaskTypeLearner TaskType = "learner"
83 | 
84 | 	// TaskTypeCollector represents collector task
85 | 	TaskTypeCollector TaskType = "collector"
86 | 
87 | 	// TaskTypeEvaluator represents evaluator task
88 | 	TaskTypeEvaluator TaskType = "evaluator"
89 | 
90 | 	// TaskTypeNone represents none task
91 | 	TaskTypeNone TaskType = "none"
92 | )
93 | 
94 | // Priority defines the priority of DIJob
95 | type Priority string
96 | 
97 | const (
98 | 	// PriorityNormal is normal priority
99 | 	PriorityNormal Priority = "normal"
100 | 
101 | 	// PriorityHigh is high priority
102 | 	PriorityHigh Priority = "high"
103 | )
104 | 
105 | type CleanPodPolicy string
106 | 
107 | const (
108 | 	// CleanPodPolicyRunning means deleting all running pods of the job after completed
109 | 	CleanPodPolicyRunning CleanPodPolicy = "Running"
110 | 
111 | 	// CleanPodPolicyAll means deleting all pods of the job after completed
112 | 	CleanPodPolicyAll CleanPodPolicy = "All"
113 | 
114 | 	// CleanPodPolicyNone means never deleting any pods of the job after completed
115 | 	CleanPodPolicyNone CleanPodPolicy = "None"
116 | )
117 | 
118 | // DIJobStatus defines the observed state of DIJob
119 | type DIJobStatus struct {
120 | 	// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
121 | 	// Important: Run "make" to regenerate code after modifying this file
122 | 
123 | 	// CompletionTimestamp defines the timestamp when the job was completed
124 | 	CompletionTimestamp *metav1.Time `json:"completionTimestamp,omitempty"`
125 | 
126 | 	// Restarts defines the number of restarts of the job
127 | 	// +kubebuilder:default=0
128 | 	Restarts int32 `json:"restarts,omitempty"`
129 | 
130 | 	// Reschedules defines the number of reschedules of the job
131 | 	// +kubebuilder:default=0
132 | 	Reschedules int32 `json:"reschedules,omitempty"`
133 | 
134 | 	// Phase defines the observed phase of the job
135 | 	// +kubebuilder:default=Pending
136 | 	Phase Phase `json:"phase,omitempty"`
137 | 
138 | 	// Replicas defines the observed number of replicas of the job
139 | 	// +kubebuilder:default=0
140 | 	Replicas int32 `json:"replicas,omitempty"`
141 | 
142 | 	// TaskStatus defines the running status of each task; the map key is task.name, the value is a TaskStatus
143 | 	TaskStatus map[string]TaskStatus `json:"taskStatus,omitempty"`
144 | 
145 | 	// ReadyReplicas defines the observed number of ready replicas of the job
146 | 	// +kubebuilder:default=0
147 | 	ReadyReplicas int32 `json:"readyReplicas,omitempty"`
148 | 
149 | 	// Allocation defines the replicas allocation of the job
150 | 	Allocation []string `json:"allocation,omitempty"`
151 | 
152 | 	// Profilings defines the profiling data reported from DI-engine jobs
153 | 	Profilings Profilings `json:"profilings,omitempty"`
154 | 
155 | 	// Conditions defines the conditions of the job
156 | 	Conditions []JobCondition `json:"conditions,omitempty"`
157 | }
158 | 
159 | // Phase defines the phase of DIJob
160 | type Phase string
161 | 
162 | const (
163 | 	// JobPending means the job has been submitted to the cluster,
164 | 	// but not all the pods and services have been created
165 | 	JobPending Phase = "Pending"
166 | 
167 | 	// JobStarting means the job has been created and waits for running.
168 | 	JobStarting Phase = "Starting"
169 | 
170 | 	// JobRestarting means some replicas failed and the job waits for restarting.
171 | 	JobRestarting Phase = "Restarting"
172 | 
173 | 	// JobRescheduling means the job has been rescheduled and waits for its replicas to be recreated.
174 | 	JobRescheduling Phase = "Rescheduling"
175 | 
176 | 	// JobRunning means all the pods are in running state
177 | 	JobRunning Phase = "Running"
178 | 
179 | 	// JobSucceeded means job completed without error
180 | 	JobSucceeded Phase = "Succeeded"
181 | 
182 | 	// JobFailed means some pods failed, so the job is also considered failed
183 | 	JobFailed Phase = "Failed"
184 | 
185 | 	// JobUnknown means the job is in unknown state
186 | 	JobUnknown Phase = "Unknown"
187 | )
188 | 
189 | // TaskStatus maps a pod phase (corev1.PodPhase) to the number of the task's replicas in that phase.
190 | type TaskStatus map[corev1.PodPhase]int32
191 | 
192 | type Profilings struct{}
193 | 
194 | // JobCondition records the conditions of DIJob
195 | type JobCondition struct {
196 | 	// Type of job condition.
197 | 	Type Phase `json:"type"`
198 | 	// Status of the condition, one of True, False, Unknown.
199 | 	Status corev1.ConditionStatus `json:"status"`
200 | 	// The reason for the condition's last transition.
201 | 	Reason string `json:"reason,omitempty"`
202 | 	// A human readable message indicating details about the transition.
203 | 	Message string `json:"message,omitempty"`
204 | 	// The last time this condition was updated.
205 | 	LastUpdateTime metav1.Time `json:"lastUpdateTime,omitempty"`
206 | 	// Last time the condition transitioned from one status to another.
207 | 	LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"`
208 | }
209 | 
210 | // +kubebuilder:object:root=true
211 | // +kubebuilder:subresource:status
212 | // +kubebuilder:resource:shortName=dijob
213 | // +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=`.status.phase`
214 | // +kubebuilder:printcolumn:name="ReadyReplicas",type=integer,JSONPath=`.status.readyReplicas`
215 | // +kubebuilder:printcolumn:name="Replicas",type=integer,JSONPath=`.status.replicas`
216 | // +kubebuilder:printcolumn:name="Restarts",type=integer,JSONPath=`.status.restarts`
217 | // +kubebuilder:printcolumn:name="Reschedules",type=integer,JSONPath=`.status.reschedules`
218 | // +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp`
219 | // DIJob is the Schema for the dijobs API
220 | type DIJob struct {
221 | 	metav1.TypeMeta   `json:",inline"`
222 | 	metav1.ObjectMeta `json:"metadata,omitempty"`
223 | 
224 | 	Spec   DIJobSpec   `json:"spec,omitempty"`
225 | 	Status DIJobStatus `json:"status,omitempty"`
226 | }
227 | 
228 | // +kubebuilder:object:root=true
229 | 
230 | // DIJobList contains a list of DIJob
231 | type DIJobList struct {
232 | 	metav1.TypeMeta `json:",inline"`
233 | 	metav1.ListMeta `json:"metadata,omitempty"`
234 | 	Items           []DIJob `json:"items"`
235 | }
236 | 
237 | func init() {
238 | 	SchemeBuilder.Register(&DIJob{}, &DIJobList{})
239 | }
--------------------------------------------------------------------------------
/docs/architecture-cn.md:
--------------------------------------------------------------------------------
1 | # DI Orchestrator Architecture
2 | 
3 | Version v1 of the DI-engine framework consists of three major modules: coordinator, collector and learner. It corresponds to DI Orchestrator v1.
4 | 
5 | Version v2 of the DI-engine framework consolidates these modules so that a complete training procedure can run inside a single worker, and a newly added worker can join directly without any restart. This document describes DI Orchestrator v2, which targets DI-engine v2.
6 | 
7 | For a detailed introduction to DI-engine, see the [DI-engine developer tutorial](https://di-engine-docs.readthedocs.io/zh_CN/latest/); for the distributed aspects, see [DI-engine distributed](https://di-engine-docs.readthedocs.io/zh_CN/latest/distributed/index_zh.html).
8 | 
9 | To support running DI-engine on Kubernetes (K8s), we designed DI Orchestrator. This document explains how, with DI Orchestrator, the components of DI-engine are created on a K8s cluster, how they discover each other, and how they start training. The architecture of DI Orchestrator is shown in the figure below:
10 | 
11 | ![](images/di-engine-arch.png)
12 | 
13 | The system consists of two major modules: `di-server` and `di-operator`. The two modules are introduced in turn below.
14 | 
15 | ## DI Operator
16 | 
17 | di-operator is the component that orchestrates DIJobs on K8s. It adopts the K8s [operator pattern](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/) and uses the control loop of the [controller pattern](https://kubernetes.io/docs/concepts/architecture/controller/) to watch the state of DIJobs in the cluster, reconciling a DIJob whenever a state-change event occurs so that its actual state stays as close as possible to the desired state.
18 | 
19 | ### API definition
20 | 
21 | Based on the characteristics of the DI-engine framework, we define the DIJob resource as a K8s Custom Resource. It describes the desired state of a DI-engine reinforcement learning (RL) job, including the image, the startup command, mounted storage, and the types and number of tasks.
22 | 
23 | The fields of DIJobSpec and their meanings are defined as follows:
24 | 
25 | ```go
26 | type DIJobSpec struct {
27 | 	// INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
28 | 	// Important: Run "make" to regenerate code after modifying this file
29 | 
30 | 	// Priority labels the priority of DIJob.
31 | 	// +kubebuilder:default=normal
32 | 	// +kubebuilder:validation:Enum=normal;high
33 | 	Priority Priority `json:"priority,omitempty"`
34 | 
35 | 	// CleanPodPolicy defines the policy to clean pods after the DIJob completes.
36 | 	// +kubebuilder:default=Running
37 | 	// +kubebuilder:validation:Enum=Running;All;None
38 | 	CleanPodPolicy CleanPodPolicy `json:"cleanPodPolicy,omitempty"`
39 | 
40 | 	// Preemptible defines whether the dijob can be preempted.
41 | 	// +kubebuilder:default=false
42 | 	Preemptible bool `json:"preemptible,omitempty"`
43 | 
44 | 	// BackoffLimit defines the restart limit for DIJob.
45 | 	// +kubebuilder:default=3
46 | 	BackoffLimit *int32 `json:"backoffLimit,omitempty"`
47 | 
48 | 	// Volumes defines the shared volumes for all tasks.
49 | 	Volumes []corev1.Volume `json:"volumes,omitempty"`
50 | 
51 | 	// Tasks provides flexible support for the different components (collector, learner, evaluator) of DI-engine.
52 | 	// +kubebuilder:validation:Required
53 | 	Tasks []Task `json:"tasks"`
54 | }
55 | 
56 | type Task struct {
57 | 	// Replicas defines the number of replicas of this task.
58 | 	// +kubebuilder:default=1
59 | 	// +kubebuilder:validation:Minimum=1
60 | 	Replicas int32 `json:"replicas,omitempty"`
61 | 
62 | 	// Type defines the type of the task.
63 | 	// +kubebuilder:validation:Enum=learner;collector;evaluator;none
64 | 	// +kubebuilder:validation:Required
65 | 	Type TaskType `json:"type,omitempty"`
66 | 
67 | 	// Name is the name of the task.
68 | 	Name string `json:"name,omitempty"`
69 | 
70 | 	// Template defines the pod template for DIJob.
71 | 	// +kubebuilder:validation:Required
72 | 	Template corev1.PodTemplateSpec `json:"template,omitempty"`
73 | }
74 | 
75 | type TaskType string
76 | 
77 | const (
78 | 	// TaskTypeLearner represents learner task
79 | 	TaskTypeLearner TaskType = "learner"
80 | 
81 | 	// TaskTypeCollector represents collector task
82 | 	TaskTypeCollector TaskType = "collector"
83 | 
84 | 	// TaskTypeEvaluator represents evaluator task
85 | 	TaskTypeEvaluator TaskType = "evaluator"
86 | 
87 | 	// TaskTypeNone represents none task
88 | 	TaskTypeNone TaskType = "none"
89 | )
90 | ```
91 | 
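92 | As a concrete illustration, the sketch below constructs a minimal DIJob with a single learner task in Go. The namespace, job name, container name and image are illustrative placeholders, not values shipped with DI Orchestrator:
93 | 
94 | ```go
95 | package example
96 | 
97 | import (
98 | 	corev1 "k8s.io/api/core/v1"
99 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
100 | 
101 | 	div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1"
102 | )
103 | 
104 | // newExampleDIJob returns a minimal DIJob with one learner task.
105 | // All names and the image are hypothetical placeholders.
106 | func newExampleDIJob() *div2alpha1.DIJob {
107 | 	return &div2alpha1.DIJob{
108 | 		ObjectMeta: metav1.ObjectMeta{Name: "dqn-example", Namespace: "di-system"},
109 | 		Spec: div2alpha1.DIJobSpec{
110 | 			Priority:       div2alpha1.PriorityNormal,
111 | 			CleanPodPolicy: div2alpha1.CleanPodPolicyRunning,
112 | 			Tasks: []div2alpha1.Task{{
113 | 				Type:     div2alpha1.TaskTypeLearner,
114 | 				Name:     "learner",
115 | 				Replicas: 1,
116 | 				Template: corev1.PodTemplateSpec{
117 | 					Spec: corev1.PodSpec{
118 | 						Containers: []corev1.Container{
119 | 							{Name: "di-container", Image: "opendilab/ding:latest"},
120 | 						},
121 | 					},
122 | 				},
123 | 			}},
124 | 		},
125 | 	}
126 | }
127 | ```
128 | 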
129 | ### Status definition
130 | 
131 | After a user submits a DIJob, di-operator takes over the management of its lifecycle. We define the following phases so that users can track the state of a DIJob:
132 | 
133 | ```go
134 | const (
135 | 	// JobPending means the job has been submitted to the cluster,
136 | 	// but not all the pods and services have been created
137 | 	JobPending Phase = "Pending"
138 | 
139 | 	// JobStarting means the job has been created and waits for running.
140 | 	JobStarting Phase = "Starting"
141 | 
142 | 	// JobRestarting means some replicas failed and the job waits for restarting.
143 | 	JobRestarting Phase = "Restarting"
144 | 
145 | 	// JobRescheduling means the job has been rescheduled and waits for its replicas to be recreated.
146 | 	JobRescheduling Phase = "Rescheduling"
147 | 
148 | 	// JobRunning means all the pods are in running state
149 | 	JobRunning Phase = "Running"
150 | 
151 | 	// JobSucceeded means job completed without error
152 | 	JobSucceeded Phase = "Succeeded"
153 | 
154 | 	// JobFailed means some pods failed, so the job is also considered failed
155 | 	JobFailed Phase = "Failed"
156 | 
157 | 	// JobUnknown means the job is in unknown state
158 | 	JobUnknown Phase = "Unknown"
159 | )
160 | ```
161 | 
162 | A DIJob that runs and finishes normally goes through four phases: Pending, Starting, Running and Succeeded. The state transition diagram is shown below:
163 | ![](images/di-engine-status-machine.png)
164 | 
165 | - When a DIJob is submitted, it enters the Pending phase.
166 | - Once di-operator has created the workers, the job enters the Starting phase.
167 | - Once all workers are ready, the job enters the Running phase.
168 | - Once all workers have Succeeded, the job enters the Succeeded phase.
169 | - When a worker fails and the maximum restart count has not been exceeded, the job enters the Restarting phase.
170 | - When a worker fails and the maximum restart count has been exceeded, the job enters the Failed phase.
171 | - When the DIJob is rescheduled or the number of workers differs from the expected number, the job enters the Rescheduling phase.
172 | 
173 | The Unknown phase is currently not used.
174 | 
175 | ### Control loops
176 | Borrowing from [Adaptdl](https://github.com/petuum/adaptdl), the v2 architecture refactors the Operator's reconciliation logic: scheduling and reconciliation are handled separately by the Allocator and the Controller, giving each component a clearer responsibility.
177 | #### Allocator control loop
178 | The Allocator is a module newly introduced in the v2 architecture. It schedules DIJobs, which covers both allocating and placing workers, and defines two methods (allocate and allocateAll) for scheduling a single job and multiple jobs respectively. To support different scheduling strategies, we define the scheduling strategy as a Policy interface with two methods, `Allocate` and `Optimize`: the former performs the initial scheduling of a job when it is submitted; the latter performs a global rescheduling over all jobs.
179 | The Policy interface is defined as follows:
180 | 
181 | ```go
182 | type Policy interface {
183 | 	Allocate(job JobInfo, nodes map[string]*NodeInfo) (NodeList, error)
184 | 	Optimize(jobs map[string]JobInfo, nodes map[string]*NodeInfo, prevAllocations map[string]NodeList) (map[string]NodeList, error)
185 | }
186 | ```
187 | Users can implement their own scheduling algorithms according to their needs.
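188 | For instance, a minimal do-nothing policy could look like the following sketch. `NoopPolicy` is hypothetical and not shipped with DI Orchestrator; the sketch assumes it is compiled in the same package as the `Policy` interface (`pkg/allocator/types`), where `JobInfo`, `NodeInfo` and `NodeList` are defined (the repository's built-in policy appears to live in `pkg/allocator/types/fit_policy.go`):
189 | 
190 | ```go
191 | package types
192 | 
193 | // NoopPolicy is a hypothetical policy used only for illustration: it
194 | // performs no initial placement and keeps previous allocations
195 | // unchanged during global optimization.
196 | type NoopPolicy struct{}
197 | 
198 | // Allocate returns an empty allocation, leaving placement to the
199 | // default scheduler.
200 | func (p NoopPolicy) Allocate(job JobInfo, nodes map[string]*NodeInfo) (NodeList, error) {
201 | 	var none NodeList
202 | 	return none, nil
203 | }
204 | 
205 | // Optimize keeps every job on its previous allocation.
206 | func (p NoopPolicy) Optimize(jobs map[string]JobInfo, nodes map[string]*NodeInfo, prevAllocations map[string]NodeList) (map[string]NodeList, error) {
207 | 	return prevAllocations, nil
208 | }
209 | ```
210 | 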
211 | 
212 | When `job.spec.preemptible==false`, the Allocator does not schedule the job. It only assigns the job a fixed number of workers equal to the sum of the replicas of all tasks declared in `job.spec.tasks[].replicas`, and writes the result to `job.status.replicas`. Users can still change the number of workers of such a job by modifying `job.status.replicas`.
213 | > Note: `job.status.replicas` cannot be modified directly with `kubectl apply` or `kubectl edit`, because `job.status` is defined as a SubResource and the `job.status` field is ignored in all PUT and POST requests on a DIJob; see [Kubernetes API Conventions](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#spec-and-status). You can run `go run ./hack/update_replicas.go --ns [your-job-namespace] --n [your-job-name] --r [expected-replicas]` to modify the replicas instead.
214 | #### Controller control loop
215 | The Controller control loop reconciles the state of DIJobs, covering lifecycle management and the creation and deletion of workers, following the state transition diagram described above.
216 | 
217 | ## DI Server
218 | 
219 | The Server is an HTTP server tailored to the DI-engine framework that supports adding, deleting and querying workers. It is built on the [gin](https://github.com/gin-gonic/gin) web framework.
220 | 
221 | The following is a brief overview of the Server's design, covering the HTTP interface for dynamically adding, deleting and querying workers, and the interface through which users report training-job profiling data.
222 | 
223 | ### HTTP interface
224 | 
225 | To let a DIJob add and remove workers dynamically, the Server exposes the following HTTP interface for creating, deleting and querying workers:
226 | 
227 | | method | path                          | description                                        |
228 | | ------ | ----------------------------- | -------------------------------------------------- |
229 | | GET    | /v2alpha1/[job_id]/replicas   | get job replicas                                   |
230 | | DELETE | /v2alpha1/[job_id]/replicas   | delete some replicas. put data in request body     |
231 | | POST   | /v2alpha1/[job_id]/replicas   | create replicas. put data in request body          |
232 | | POST   | /v2alpha1/[job_id]/profilings | post job profiling data. put data in request body  |
233 | 
234 | job_id consists of the triple `namespace.name.generation`.
235 | - create and delete requests: the request body is `{"replicas": n}`. The Server reads `replicas` from the body and directly modifies `job.status.replicas`; the actual creation and deletion are performed by the Operator. (Note: the Server only operates on preemptible DIJobs.)
236 | - get requests: the Server queries the replicas of the DIJob and returns the `[ip:port]` for reaching each replica.
237 | - post profilings requests: the request body is `{"data": {}}`. The Server reads `data` from the body and patches it into `job.status.profilings`.
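238 | As a sketch of how a client might call this interface, the Go snippet below asks di-server to set a job's replicas to 2. The address `di-server.di-system:8080` and the job_id value are placeholders, not values guaranteed by this repository; use whatever address your di-server Service actually exposes:
239 | 
240 | ```go
241 | package main
242 | 
243 | import (
244 | 	"bytes"
245 | 	"fmt"
246 | 	"net/http"
247 | )
248 | 
249 | func main() {
250 | 	// job_id is the namespace.name.generation triple described above.
251 | 	jobID := "di-system.dqn-example.1"
252 | 	// Placeholder address: substitute wherever di-server is reachable.
253 | 	url := fmt.Sprintf("http://di-server.di-system:8080/v2alpha1/%s/replicas", jobID)
254 | 
255 | 	// POST {"replicas": n} asks the server to update job.status.replicas;
56 | 	// the Operator then creates or deletes the actual pods. The server
257 | 	// only acts on preemptible DIJobs.
258 | 	resp, err := http.Post(url, "application/json", bytes.NewBufferString(`{"replicas": 2}`))
259 | 	if err != nil {
260 | 		panic(err)
261 | 	}
262 | 	defer resp.Body.Close()
263 | 	fmt.Println("status:", resp.Status)
264 | }
265 | ```
266 | 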
267 | 
268 | ## Job execution flow
269 | 
270 | A submitted job runs in the cluster according to the following flow: the Allocator schedules it, the Controller orchestrates its containers, and the Server reports the job's profiling data.
271 | ![](images/di-engine-schedule.png)
272 | 
273 | 1. The user submits a DIJob to the K8s cluster.
274 | 2. The Allocator performs the initial allocation (see the sketch after this list):
275 |    1. For non-preemptible jobs, the user defines the number of replicas and the Allocator makes no change.
276 |    2. For preemptible jobs, the Allocator sets `job.status.allocation` according to the resource usage of the job's tasks; `job.status.allocation` is a list of nodes, one per replica, giving the node each replica is placed on. (The scheduling policy is not implemented yet.)
277 | 3. The Controller observes the job change in the K8s cluster.
278 | 4. The Controller creates the corresponding number of replicas:
279 |    1. For non-preemptible jobs, it creates as many replicas as `job.status.replicas`.
280 |    2. For preemptible jobs, it creates as many replicas as `job.status.allocation` and pins each replica to its assigned node.
281 | 5. The replicas start up and begin training; after a while they report the collected profiling data to the Server.
282 | 6. The Server updates the profiling data in `job.status.profilings`.
283 | 7. At every fixed scheduling interval, the Allocator reschedules all jobs:
284 |    1. Non-preemptible jobs are not rescheduled.
285 |    2. Preemptible jobs are rescheduled globally, using each job's `job.status.profilings` and the scheduling strategy defined by the Allocator Policy, and each job's `job.status.allocation` is updated.
286 | 8. The Controller observes the job changes in the K8s cluster.
287 | 9. The Controller creates the corresponding number of replicas.
288 | 
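289 | For concreteness, the hedged sketch below shows the kind of allocation step 2 might produce for a hypothetical preemptible job with three replicas (the node names are invented):
290 | 
291 | ```go
292 | // Allocation holds one node name per replica: replica i is placed on
293 | // job.Status.Allocation[i].
294 | job.Status.Allocation = []string{"node-1", "node-1", "node-2"}
295 | ```
296 | 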
"github.com/onsi/gomega" 10 | corev1 "k8s.io/api/core/v1" 11 | "k8s.io/apimachinery/pkg/types" 12 | 13 | div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1" 14 | dicommon "opendilab.org/di-orchestrator/pkg/common" 15 | diutil "opendilab.org/di-orchestrator/pkg/utils" 16 | testutil "opendilab.org/di-orchestrator/pkg/utils/testutils" 17 | ) 18 | 19 | var _ = Describe("DIJob Specification", func() { 20 | 21 | Context("Test DIJob Specification", func() { 22 | It("Should execute different pods deletion policy with different CleanPodPolicy", func() { 23 | cleanPodPolicies := []div2alpha1.CleanPodPolicy{ 24 | div2alpha1.CleanPodPolicyAll, 25 | div2alpha1.CleanPodPolicyRunning, 26 | div2alpha1.CleanPodPolicyNone, 27 | } 28 | for _, policy := range cleanPodPolicies { 29 | type testCase struct { 30 | runnings int // pending pods are also considered as running pods 31 | replicaStatues [][]corev1.PodPhase 32 | } 33 | testCases := []testCase{ 34 | { 35 | runnings: 2, 36 | replicaStatues: [][]corev1.PodPhase{ 37 | {corev1.PodRunning}, {corev1.PodRunning}, {corev1.PodFailed, corev1.PodSucceeded}, 38 | }, 39 | }, 40 | { 41 | runnings: 0, 42 | replicaStatues: [][]corev1.PodPhase{ 43 | {corev1.PodFailed}, {corev1.PodFailed}, {corev1.PodFailed, corev1.PodSucceeded}, 44 | }, 45 | }, 46 | { 47 | runnings: 3, 48 | replicaStatues: [][]corev1.PodPhase{ 49 | {corev1.PodRunning}, {corev1.PodRunning}, {corev1.PodRunning, corev1.PodFailed}, 50 | }, 51 | }, 52 | { 53 | runnings: 0, 54 | replicaStatues: [][]corev1.PodPhase{ 55 | {corev1.PodFailed}, {corev1.PodFailed}, {corev1.PodFailed, corev1.PodFailed}, 56 | }, 57 | }, 58 | { 59 | runnings: 0, 60 | replicaStatues: [][]corev1.PodPhase{ 61 | {corev1.PodSucceeded}, {corev1.PodSucceeded}, {corev1.PodSucceeded, corev1.PodSucceeded}, 62 | }, 63 | }, 64 | } 65 | for i := range testCases { 66 | c := testCases[i] 67 | By(fmt.Sprintf("Create %dth DIJob", i+1)) 68 | var err error 69 | jobTmpl := testutil.NewDIJob() 70 | for i := range jobTmpl.Spec.Tasks { 71 | jobTmpl.Spec.Tasks[i].Replicas = int32(len(c.replicaStatues[i])) 72 | } 73 | totalReplicas := 0 74 | for _, task := range jobTmpl.Spec.Tasks { 75 | totalReplicas += int(task.Replicas) 76 | } 77 | jobTmpl.Spec.BackoffLimit = diutil.Int32(0) 78 | jobTmpl.Spec.CleanPodPolicy = policy 79 | job, _ := createAndUpdateReplicas(ctx, jobTmpl) 80 | 81 | By("Update workers status") 82 | for taskIndex, taskStatus := range c.replicaStatues { 83 | for podIndex, phase := range taskStatus { 84 | replicaName := diutil.ReplicaName(job.Name, job.Spec.Tasks[taskIndex].Name, podIndex) 85 | podKey := types.NamespacedName{Namespace: job.Namespace, Name: replicaName} 86 | err = testutil.UpdatePodPhase(ctx, podKey, phase) 87 | Expect(err).NotTo(HaveOccurred()) 88 | } 89 | } 90 | 91 | By("Get the number of pods") 92 | pods, err := ctx.ListJobPods(context.Background(), &job) 93 | Expect(err).NotTo(HaveOccurred()) 94 | npods := len(pods) 95 | 96 | By("Checking all the pods and services are deleted") 97 | 98 | switch policy { 99 | case div2alpha1.CleanPodPolicyAll: 100 | Eventually(func() int { 101 | pods, err := ctx.ListJobPods(context.Background(), &job) 102 | if err != nil { 103 | return -1 104 | } 105 | return len(pods) 106 | }, timeout, interval).Should(Equal(0)) 107 | Eventually(func() int { 108 | svcs, err := ctx.ListJobServices(context.Background(), &job) 109 | if err != nil { 110 | return -1 111 | } 112 | return len(svcs) 113 | }, timeout, interval).Should(Equal(0)) 114 | case div2alpha1.CleanPodPolicyNone: 115 | Consistently(func() 
int {
116 | 							pods, err := ctx.ListJobPods(context.Background(), &job)
117 | 							if err != nil {
118 | 								return -1
119 | 							}
120 | 							return len(pods)
121 | 						}, duration, interval).Should(Equal(npods))
122 | 						Eventually(func() int {
123 | 							svcs, err := ctx.ListJobServices(context.Background(), &job)
124 | 							if err != nil {
125 | 								return -1
126 | 							}
127 | 							return len(svcs)
128 | 						}, timeout, interval).Should(Equal(0))
129 | 					case div2alpha1.CleanPodPolicyRunning:
130 | 						Eventually(func() int {
131 | 							pods, err := ctx.ListJobPods(context.Background(), &job)
132 | 							if err != nil {
133 | 								return -1
134 | 							}
135 | 							return len(pods)
136 | 						}, timeout, interval).Should(Equal(npods - c.runnings))
137 | 						Eventually(func() int {
138 | 							svcs, err := ctx.ListJobServices(context.Background(), &job)
139 | 							if err != nil {
140 | 								return -1
141 | 							}
142 | 							return len(svcs)
143 | 						}, timeout, interval).Should(Equal(0))
144 | 					}
145 | 
146 | 					By("Clean up pods")
147 | 					err = ctx.CleanUpJob(context.Background(), &job)
148 | 					Expect(err).NotTo(HaveOccurred())
149 | 				}
150 | 			}
151 | 		})
152 | 		It("Should create replicas with correct envs set", func() {
153 | 			type testCase struct {
154 | 				replicas         []int
155 | 				expectedEnvNodes int
156 | 			}
157 | 			testCases := []testCase{
158 | 				{replicas: []int{1, 1, 1}, expectedEnvNodes: 9},
159 | 				{replicas: []int{1, 1, 2}, expectedEnvNodes: 16},
160 | 				{replicas: []int{1, 3, 2}, expectedEnvNodes: 36},
161 | 			}
162 | 			for i := range testCases {
163 | 				c := testCases[i]
164 | 				By(fmt.Sprintf("Create %dth DIJob", i+1))
165 | 				var err error
166 | 				jobTmpl := testutil.NewDIJob()
167 | 				for i := range jobTmpl.Spec.Tasks {
168 | 					jobTmpl.Spec.Tasks[i].Replicas = int32(c.replicas[i])
169 | 				}
170 | 				totalReplicas := 0
171 | 				for _, task := range jobTmpl.Spec.Tasks {
172 | 					totalReplicas += int(task.Replicas)
173 | 				}
174 | 
175 | 				job, jobKey := createAndUpdateReplicas(ctx, jobTmpl)
176 | 
177 | 				By("Check the created DIJob is in Starting state")
178 | 				checkDIJobPhase(ctx, jobKey, div2alpha1.JobStarting)
179 | 
180 | 				By("Check workers' attached nodes are as expected")
181 | 				Eventually(func() int {
182 | 					pods, err := ctx.ListJobPods(context.Background(), &job)
183 | 					if err != nil {
184 | 						return -1
185 | 					}
186 | 					nodes := 0
187 | 					for _, pod := range pods {
188 | 						for _, env := range pod.Spec.Containers[0].Env {
189 | 							if env.Name == dicommon.ENVNodes {
190 | 								if env.Value == "" {
191 | 									continue
192 | 								}
193 | 								nodes += len(strings.Split(env.Value, ","))
194 | 							}
195 | 						}
196 | 					}
197 | 					return nodes
198 | 				}, timeout, interval).Should(Equal(c.expectedEnvNodes))
199 | 
200 | 				By("Clean up pods")
201 | 				err = ctx.CleanUpJob(context.Background(), &job)
202 | 				Expect(err).NotTo(HaveOccurred())
203 | 			}
204 | 		})
205 | 
206 | 		It("Should have correct status", func() {
207 | 			type testCase struct {
208 | 				replicas         []int
209 | 				expectedReplicas int
210 | 			}
211 | 			testCases := []testCase{
212 | 				{replicas: []int{1, 1, 1}, expectedReplicas: 3},
213 | 				{replicas: []int{1, 1, 2}, expectedReplicas: 4},
214 | 				{replicas: []int{1, 3, 2}, expectedReplicas: 6},
215 | 			}
216 | 			for i := range testCases {
217 | 				c := testCases[i]
218 | 				By(fmt.Sprintf("Create %dth DIJob", i+1))
219 | 				var err error
220 | 				jobTmpl := testutil.NewDIJob()
221 | 				for i := range jobTmpl.Spec.Tasks {
222 | 					jobTmpl.Spec.Tasks[i].Replicas = int32(c.replicas[i])
223 | 				}
224 | 				totalReplicas := 0
225 | 				for _, task := range jobTmpl.Spec.Tasks {
226 | 					totalReplicas += int(task.Replicas)
227 | 				}
228 | 
229 | 				job, jobKey := createAndUpdateReplicas(ctx, jobTmpl)
230 | 
231 | 				By("Check the created DIJob is in Starting state")
232 | 				
checkDIJobPhase(ctx, jobKey, div2alpha1.JobStarting) 233 | 234 | By("Update workers to Running") 235 | err = updateWorkerPodsPhase(&job, corev1.PodRunning) 236 | Expect(err).NotTo(HaveOccurred()) 237 | 238 | By("Check status replicas are as expected") 239 | Eventually(func() int { 240 | err := ctx.Get(context.Background(), jobKey, &job) 241 | if err != nil { 242 | return -1 243 | } 244 | return int(job.Status.Replicas) 245 | }, timeout, interval).Should(Equal(c.expectedReplicas)) 246 | 247 | By("Check status taskStatus are as expected") 248 | Eventually(func() int { 249 | err := ctx.Get(context.Background(), jobKey, &job) 250 | if err != nil { 251 | return -1 252 | } 253 | count := 0 254 | for _, taskStatus := range job.Status.TaskStatus { 255 | count += int(taskStatus[corev1.PodRunning]) 256 | } 257 | return count 258 | }, timeout, interval).Should(Equal(c.expectedReplicas)) 259 | 260 | By("Clean up pods") 261 | err = ctx.CleanUpJob(context.Background(), &job) 262 | Expect(err).NotTo(HaveOccurred()) 263 | } 264 | }) 265 | 266 | It("Should have correct volumes when job volumes specified", func() { 267 | type testCase struct { 268 | volumes int 269 | expectedVolumes int 270 | } 271 | testCases := []testCase{ 272 | {volumes: 1, expectedVolumes: 1}, 273 | {volumes: 3, expectedVolumes: 3}, 274 | {volumes: 4, expectedVolumes: 4}, 275 | } 276 | for i := range testCases { 277 | c := testCases[i] 278 | By(fmt.Sprintf("Create %dth DIJob", i+1)) 279 | var err error 280 | jobTmpl := testutil.NewDIJob() 281 | jobTmpl.Spec.Volumes = make([]corev1.Volume, 0) 282 | for i := 0; i < c.volumes; i++ { 283 | volume := corev1.Volume{ 284 | Name: fmt.Sprintf("volume-%d", i), 285 | VolumeSource: corev1.VolumeSource{ 286 | HostPath: &corev1.HostPathVolumeSource{ 287 | Path: "/example", 288 | }, 289 | }, 290 | } 291 | jobTmpl.Spec.Volumes = append(jobTmpl.Spec.Volumes, volume) 292 | } 293 | 294 | job, jobKey := createAndUpdateReplicas(ctx, jobTmpl) 295 | 296 | By("Check the created DIJob is in Starting state") 297 | checkDIJobPhase(ctx, jobKey, div2alpha1.JobStarting) 298 | 299 | By("Check pod volumes are as expected") 300 | pods, err := ctx.ListJobPods(context.Background(), &job) 301 | Expect(err).NotTo(HaveOccurred()) 302 | for _, pod := range pods { 303 | Expect(len(pod.Spec.Volumes)).Should(Equal(c.expectedVolumes)) 304 | } 305 | 306 | By("Clean up pods") 307 | err = ctx.CleanUpJob(context.Background(), &job) 308 | Expect(err).NotTo(HaveOccurred()) 309 | } 310 | }) 311 | }) 312 | }) 313 | --------------------------------------------------------------------------------