├── .github └── workflows │ └── build.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── OWNERS ├── README.md ├── RELEASE.md ├── ROADMAP.md ├── docs ├── api │ ├── autogen │ │ ├── config.yaml │ │ └── templates │ │ │ ├── gv_details.tpl │ │ │ ├── gv_list.tpl │ │ │ ├── type.tpl │ │ │ └── type_members.tpl │ └── generated.asciidoc └── prometheus-metrics.md ├── go.mod ├── go.sum ├── hack ├── boilerplate │ └── boilerplate.go.txt ├── generate-apidoc.sh ├── scripts │ └── update-changelog.sh ├── update-codegen.sh ├── verify-codegen.sh └── verify-gomod.sh ├── linter_config.yaml ├── pkg ├── apis │ └── common │ │ └── v1 │ │ ├── constants.go │ │ ├── doc.go │ │ ├── interface.go │ │ ├── openapi_generated.go │ │ ├── types.go │ │ ├── zz_generated.deepcopy.go │ │ └── zz_generated.defaults.go ├── controller.v1 │ ├── common │ │ ├── job.go │ │ ├── job_controller.go │ │ ├── job_test.go │ │ ├── pod.go │ │ ├── pod_test.go │ │ ├── scheduling.go │ │ ├── service.go │ │ ├── service_test.go │ │ ├── status.go │ │ ├── status_test.go │ │ ├── util.go │ │ └── util_test.go │ ├── control │ │ ├── controller_ref_manager.go │ │ ├── controller_ref_manager_test.go │ │ ├── pod_control.go │ │ ├── pod_control_test.go │ │ ├── podgroup_control.go │ │ ├── service_control.go │ │ ├── service_control_test.go │ │ └── utils.go │ └── expectation │ │ ├── expectation.go │ │ ├── expectation_test.go │ │ └── util.go ├── core │ ├── job.go │ ├── pod.go │ ├── service.go │ ├── status.go │ └── utils.go ├── reconciler.v1 │ └── common │ │ ├── README.md │ │ ├── gang.go │ │ ├── gang_scheduler_framework.go │ │ ├── gang_volcano.go │ │ ├── interface.go │ │ ├── job.go │ │ ├── pod.go │ │ ├── pod_test.go │ │ ├── service.go │ │ ├── service_test.go │ │ ├── utils.go │ │ └── utils_test.go └── util │ ├── counter.go │ ├── k8sutil │ ├── client.go │ └── k8sutil.go │ ├── labels │ ├── labels.go │ └── labels_test.go │ ├── logger.go │ ├── signals │ ├── signal.go │ ├── signal_posix.go │ └── signal_windows.go │ ├── status.go │ ├── status_test.go │ 
├── train │ ├── train_util.go │ └── train_util_test.go │ └── util.go └── test_job ├── README.md ├── apis └── test_job │ └── v1 │ ├── constants.go │ ├── defaults.go │ ├── doc.go │ ├── openapi_generated.go │ ├── register.go │ ├── types.go │ ├── zz_generated.deepcopy.go │ └── zz_generated.defaults.go ├── client ├── clientset │ └── versioned │ │ ├── clientset.go │ │ ├── doc.go │ │ ├── fake │ │ ├── clientset_generated.go │ │ ├── doc.go │ │ └── register.go │ │ ├── scheme │ │ ├── doc.go │ │ └── register.go │ │ └── typed │ │ └── test_job │ │ └── v1 │ │ ├── doc.go │ │ ├── fake │ │ ├── doc.go │ │ ├── fake_test_job_client.go │ │ └── fake_testjob.go │ │ ├── generated_expansion.go │ │ ├── test_job_client.go │ │ └── testjob.go ├── informers │ └── externalversions │ │ ├── factory.go │ │ ├── generic.go │ │ ├── internalinterfaces │ │ └── factory_interfaces.go │ │ └── test_job │ │ ├── interface.go │ │ └── v1 │ │ ├── interface.go │ │ └── testjob.go └── listers │ └── test_job │ └── v1 │ ├── expansion_generated.go │ └── testjob.go ├── controller.v1 └── test_job │ └── test_job_controller.go ├── reconciler.v1 └── test_job │ ├── dummy_client.go │ └── test_job_reconciler.go └── test_util └── v1 ├── const.go ├── pod.go ├── service.go ├── test_job_util.go └── util.go /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | name: Build 3 | jobs: 4 | test: 5 | runs-on: ubuntu-latest 6 | env: 7 | GOPATH: ${{ github.workspace }} 8 | GO111MODULE: on 9 | defaults: 10 | run: 11 | working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/common 12 | steps: 13 | - uses: actions/checkout@v3 14 | with: 15 | fetch-depth: 1 16 | path: ${{ env.GOPATH }}/src/github.com/kubeflow/common 17 | 18 | - name: Setup Go 19 | uses: actions/setup-go@v3 20 | with: 21 | go-version-file: ${{ env.GOPATH }}/src/github.com/kubeflow/common/go.mod 22 | 23 | - uses: actions/cache@v3 24 | with: 25 | path: ~/go/pkg/mod 26 | key: ${{ 
runner.os }}-go-${{ hashFiles('**/go.sum') }} 27 | restore-keys: | 28 | ${{ runner.os }}-go- 29 | 30 | - name: Build 31 | run: | 32 | ./hack/verify-gomod.sh 33 | ./hack/verify-codegen.sh 34 | go build ./... 35 | go fmt ./... 36 | 37 | - name: Install dependencies 38 | run: | 39 | # get coveralls.io support 40 | go install github.com/mattn/goveralls@latest 41 | # Install golangci-lint 42 | curl -sfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin 43 | 44 | - name: Test 45 | run: | 46 | golangci-lint run --config=linter_config.yaml ./... 47 | # Here we run all tests in pkg and we have to use `-ignore` 48 | # since goveralls uses `filepath.Match` to match ignore files 49 | # and it does not support patterns like `**`. 50 | goveralls -service=github -v -package ./... -ignore "test_job/client/*/*.go,test_job/client/*/*/*.go,test_job/client/*/*/*/*.go,test_job/client/*/*/*/*/*.go,test_job/client/*/*/*/*/*/*.go,test_job/client/*/*/*/*/*/*/*.go,test_job/testutil/*.go,test_job/*/*/*/zz_generated.*.go,test_job/*/*/*/*_generated.go,pkg/apis/common/*/zz_generated.*.go,pkg/apis/common/*/*_generated.go" || echo "push to coveralls failed" 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IDE related 2 | .idea/ 3 | .vscode/ 4 | .swp 5 | 6 | # Items below are adopted from https://github.com/github/gitignore/blob/master/Go.gitignore 7 | 8 | ## Binaries for programs and plugins 9 | *.exe 10 | *.exe~ 11 | *.dll 12 | *.so 13 | *.dylib 14 | 15 | ## Test binary, built with `go test -c` 16 | *.test 17 | 18 | ## Output of the go coverage tool, specifically when used with LiteIDE 19 | *.out 20 | 21 | # Items below are adopted from https://github.com/github/gitignore/blob/master/Global/Linux.gitignore 22 | 23 | *~ 24 | 25 | ## temporary files which can be created if a process still has a handle 
open of a deleted file 26 | .fuse_hidden* 27 | 28 | ## KDE directory preferences 29 | .directory 30 | 31 | ## Linux trash folder which might appear on any partition or disk 32 | .Trash-* 33 | 34 | ## .nfs files are created when an open file is removed but is still being accessed 35 | .nfs* 36 | 37 | # Items below are adopted from https://github.com/github/gitignore/blob/master/Global/macOS.gitignore 38 | 39 | ## General 40 | .DS_Store 41 | .AppleDouble 42 | .LSOverride 43 | 44 | ## Icon must end with two \r 45 | Icon 46 | 47 | ## Thumbnails 48 | ._* 49 | 50 | ## Files that might appear in the root of a volume 51 | .DocumentRevisions-V100 52 | .fseventsd 53 | .Spotlight-V100 54 | .TemporaryItems 55 | .Trashes 56 | .VolumeIcon.icns 57 | .com.apple.timemachine.donotpresent 58 | 59 | ## Directories potentially created on remote AFP share 60 | .AppleDB 61 | .AppleDesktop 62 | Network Trash Folder 63 | Temporary Items 64 | .apdisk 65 | vendor/ 66 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guide 2 | 3 | This doc is the contributing guideline for Kubeflow/common developers. 4 | 5 | ## Git development workflow 6 | 7 | We use the [GitHub flow](https://guides.github.com/introduction/flow/) for development. Please check it out to get familiar with the process. 8 | 9 | ## Before PR submission 10 | 11 | Before submitting a pull request, please make sure the code passes all the tests and is free of lint errors. The following sections outline the instructions. 12 | 13 | ### Build 14 | 15 | ```bash 16 | # Build the package. 17 | go build ./... 18 | ``` 19 | 20 | ### Code formatting 21 | 22 | ```bash 23 | # Format your code. 24 | go fmt ./... 25 | ``` 26 | 27 | ### Code generation 28 | 29 | ```bash 30 | # Make sure to update the generated code if there are any API-level changes. 
31 | ./hack/update-codegen.sh 32 | ``` 33 | 34 | ```bash 35 | # Make sure your API and client are up-to-date. 36 | ./hack/verify-codegen.sh 37 | ``` 38 | 39 | ### Code from upstream Kubernetes 40 | 41 | Some of the code is borrowed from upstream Kubernetes, such as [controller_utils.go](https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/controller_utils.go), which helps us remove the direct dependency on Kubernetes. For more background on this, please check out the discussions in [issue #48](https://github.com/kubeflow/common/issues/48). In addition, the following folders also contain some auxiliary code to help us easily build the operators: 42 | 43 | - [control](./pkg/controller.v1/control) 44 | - [expectation](./pkg/controller.v1/expectation) 45 | 46 | *Note: Please don't edit these files. If you encounter any issues, please file an issue [here](https://github.com/kubeflow/common/issues).* 47 | 48 | We have a long-term plan to move them to [kubernetes/client-go](https://github.com/kubernetes/client-go). See issue [kubernetes/client-go/issues/332](https://github.com/kubernetes/client-go/issues/332) for more details. 49 | 50 | ## Additional Guidelines for Maintainers 51 | 52 | If you are one of the maintainers of the repo, please check out the following additional guidelines. 53 | 54 | ### Commit style guide 55 | 56 | 1. Please always use the bot to merge PRs and never manually merge PRs. 57 | 58 | 2. Release notes are generated from commits so writing good commit messages is important. 
Below is an example: 59 | 60 | ```md 61 | Enhance maintainability of operator common module (#55, @Jeffwan) 62 | Add proposal for Prometheus metrics coverage (#77, @terrytangyuan) 63 | ``` 64 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | approvers: 2 | - gaocegege 3 | - terrytangyuan 4 | - Jeffwan 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kubeflow common for operators 2 | 3 | [![Build Status](https://github.com/kubeflow/common/actions/workflows/build.yml/badge.svg?branch=master)](https://github.com/kubeflow/common/actions/?query=workflow%3ABuild) 4 | [![Go Report Card](https://goreportcard.com/badge/github.com/kubeflow/common)](https://goreportcard.com/report/github.com/kubeflow/common) 5 | 6 | This repo contains the libraries for writing custom job operators such as tf-operator and pytorch-operator. 7 | To write a custom operator, users need to do the following steps: 8 | 9 | - Generate the operator skeleton using [kube-builder](https://github.com/kubernetes-sigs/kubebuilder) or [operator-sdk](https://github.com/operator-framework/operator-sdk) 10 | 11 | - Define the job CRD and reuse the common API. Check [test_job](test_job) for a full example. 
12 | 13 | ```go 14 | import ( 15 | commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" 16 | ) 17 | 18 | // reuse commonv1 api in your type.go 19 | RunPolicy *commonv1.RunPolicy `json:"runPolicy,omitempty"` 20 | TestReplicaSpecs map[TestReplicaType]*commonv1.ReplicaSpec `json:"testReplicaSpecs"` 21 | ``` 22 | 23 | - Write a custom controller that implements [controller interface](pkg/apis/common/v1/interface.go), such as the [TestJobController](test_job/controller.v1/test_job/test_job_controller.go) and instantiate a testJobController object 24 | ```go 25 | testJobController := TestJobController { 26 | ... 27 | } 28 | ``` 29 | - Instantiate a [JobController](pkg/controller.v1/common/job_controller.go) struct object and pass in the custom controller written in the previous step as a parameter 30 | ```go 31 | import "github.com/kubeflow/common/pkg/controller.v1/common" 32 | 33 | jobController := common.JobController { 34 | Controller: testJobController, 35 | Config: v1.JobControllerConfiguration{EnableGangScheduling: false}, 36 | Recorder: recorder, 37 | } 38 | ``` 39 | - Within your main reconcile loop, call the [JobController.ReconcileJobs](pkg/controller.v1/common/job.go) method. 40 | ```go 41 | reconcile(...) { 42 | // Your main reconcile loop. 43 | ... 44 | jobController.ReconcileJobs(...) 45 | ... 46 | } 47 | 48 | ``` 49 | Note that this repo is still under construction; API compatibility is not guaranteed at this point. 50 | 51 | ## API Reference 52 | 53 | Please refer to the [API documentation](docs/api/generated.asciidoc). 54 | 55 | The API files are located under `pkg/apis/common/v1`: 56 | 57 | - [constants.go](pkg/apis/common/v1/constants.go): the constants such as label keys. 58 | - [interface.go](pkg/apis/common/v1/interface.go): the interfaces to be implemented by custom controllers. 59 | - [job_controller.go](pkg/controller.v1/common/job_controller.go): the main `JobController` that contains the `ReconcileJobs` API method to be invoked by the user. 
This is the entrypoint of 60 | the `JobController` logic. The rest of the code under the `pkg/controller.v1/common/` folder contains the core logic for the `JobController` to work, such as creating and managing worker pods, services, etc. 61 | -------------------------------------------------------------------------------- /ROADMAP.md: -------------------------------------------------------------------------------- 1 | # Kubeflow Distributed Training Operators 2020 Roadmap 2 | 3 | This document outlines the main directions for the Kubeflow Distributed Training Operators in 2020. 4 | 5 | ## Maintenance and reliability 6 | 7 | We will continue developing capabilities for better reliability, scaling, and maintenance of production distributed training experiences provided by operators. 8 | 9 | * Enhance maintainability of operator common module. Related issue: [#54](https://github.com/kubeflow/common/issues/54). 10 | * Migrate operators to use [kubeflow/common](https://github.com/kubeflow/common) APIs. Related issue: [#64](https://github.com/kubeflow/common/issues/64). 11 | * Graduate MPI Operator, MXNet Operator and XGBoost Operator to v1. Related issue: [#65](https://github.com/kubeflow/common/issues/65). 12 | 13 | ## Features 14 | 15 | To take advantage of other capabilities of job scheduler components, operators will expose more APIs for advanced scheduling. More features will be added to simplify usage like dynamic volume support and GitOps experiences. In order to make it easily used in the Kubeflow ecosystem, we can add more launcher KFP components for adoption. 16 | 17 | * Support dynamic volume provisioning for distributed training jobs. Related issue: [#19](https://github.com/kubeflow/common/issues/19). 18 | * MLOps - Allow users to submit jobs using a Git repo without building container images. Related issue: [#66](https://github.com/kubeflow/common/issues/66). 19 | * Add Job priority and Queue in SchedulingPolicy for advanced scheduling in common operator. 
Related issue: [#46](https://github.com/kubeflow/common/issues/46). 20 | * Add pipeline launcher components for different training jobs. Related issue: [pipeline#3445](https://github.com/kubeflow/pipelines/issues/3445). 21 | 22 | ## Monitoring 23 | 24 | * Provides a standardized logging interface. Related issue: [#60](https://github.com/kubeflow/common/issues/60). 25 | * Expose generic prometheus metrics in common operators. Related issue: [#22](https://github.com/kubeflow/common/issues/22). 26 | * Centralized Job Dashboard for training jobs (Add metadata graph, model artifacts later). Related issue: [#67](https://github.com/kubeflow/common/issues/67). 27 | 28 | ## Performance 29 | 30 | Continue to optimize reconciler performance and reduce latency to take actions on CR events. 31 | 32 | * Performance optimization for 500 concurrent jobs and large scale completed jobs. Related issues: [#68](https://github.com/kubeflow/common/issues/68), [tf-operator#965](https://github.com/kubeflow/tf-operator/issues/965), and [tf-operator#1079](https://github.com/kubeflow/tf-operator/issues/1079). 33 | -------------------------------------------------------------------------------- /docs/api/autogen/config.yaml: -------------------------------------------------------------------------------- 1 | render: 2 | kubernetesVersion: "1.16" -------------------------------------------------------------------------------- /docs/api/autogen/templates/gv_details.tpl: -------------------------------------------------------------------------------- 1 | {{- define "gvDetails" -}} 2 | {{- $gv := . -}} 3 | [id="{{ asciidocGroupVersionID $gv | asciidocRenderAnchorID }}"] 4 | == {{ $gv.GroupVersionString }} 5 | 6 | {{ $gv.Doc }} 7 | 8 | {{- if $gv.Kinds }} 9 | .Resource Types 10 | {{- range $gv.SortedKinds }} 11 | - {{ $gv.TypeForKind . | asciidocRenderTypeLink }} 12 | {{- end }} 13 | {{ end }} 14 | 15 | === Definitions 16 | {{ range $gv.SortedTypes }} 17 | {{ template "type" . 
}} 18 | {{ end }} 19 | 20 | {{- end -}} -------------------------------------------------------------------------------- /docs/api/autogen/templates/gv_list.tpl: -------------------------------------------------------------------------------- 1 | {{- define "gvList" -}} 2 | {{- $groupVersions := . -}} 3 | 4 | // Generated documentation. Please do not edit. 5 | :anchor_prefix: k8s-api 6 | 7 | [id="{p}-api-reference"] 8 | = API Reference 9 | 10 | .Packages 11 | {{- range $groupVersions }} 12 | - {{ asciidocRenderGVLink . }} 13 | {{- end }} 14 | 15 | {{ range $groupVersions }} 16 | {{ template "gvDetails" . }} 17 | {{ end }} 18 | 19 | {{- end -}} -------------------------------------------------------------------------------- /docs/api/autogen/templates/type.tpl: -------------------------------------------------------------------------------- 1 | {{- define "type" -}} 2 | {{- $type := . -}} 3 | {{- if asciidocShouldRenderType $type -}} 4 | 5 | [id="{{ asciidocTypeID $type | asciidocRenderAnchorID }}"] 6 | ==== {{ $type.Name }} {{ if $type.IsAlias }}({{ asciidocRenderTypeLink $type.UnderlyingType }}) {{ end }} 7 | 8 | {{ $type.Doc }} 9 | 10 | {{ if $type.References -}} 11 | .Appears In: 12 | **** 13 | {{- range $type.SortedReferences }} 14 | - {{ asciidocRenderTypeLink . }} 15 | {{- end }} 16 | **** 17 | {{- end }} 18 | 19 | {{ if $type.Members -}} 20 | [cols="25a,75a", options="header"] 21 | |=== 22 | | Field | Description 23 | {{ if $type.GVK -}} 24 | | *`apiVersion`* __string__ | `{{ $type.GVK.Group }}/{{ $type.GVK.Version }}` 25 | | *`kind`* __string__ | `{{ $type.GVK.Kind }}` 26 | {{ end -}} 27 | 28 | {{ range $type.Members -}} 29 | | *`{{ .Name }}`* __{{ asciidocRenderType .Type }}__ | {{ template "type_members" . 
}} 30 | {{ end -}} 31 | |=== 32 | {{ end -}} 33 | 34 | {{- end -}} 35 | {{- end -}} -------------------------------------------------------------------------------- /docs/api/autogen/templates/type_members.tpl: -------------------------------------------------------------------------------- 1 | {{- define "type_members" -}} 2 | {{- $field := . -}} 3 | {{- if eq $field.Name "metadata" -}} 4 | Refer to Kubernetes API documentation for fields of `metadata`. 5 | {{ else -}} 6 | {{ $field.Doc }} 7 | {{- end -}} 8 | {{- end -}} -------------------------------------------------------------------------------- /docs/api/generated.asciidoc: -------------------------------------------------------------------------------- 1 | // Generated documentation. Please do not edit. 2 | :anchor_prefix: k8s-api 3 | 4 | [id="{p}-api-reference"] 5 | = API Reference 6 | 7 | .Packages 8 | - xref:{anchor_prefix}-kubeflow-org-v1[$$kubeflow.org/v1$$] 9 | 10 | 11 | [id="{anchor_prefix}-kubeflow-org-v1"] 12 | == kubeflow.org/v1 13 | 14 | Package v1 is the v1 version of the API. 15 | 16 | 17 | Licensed under the Apache License, Version 2.0 (the "License"); 18 | you may not use this file except in compliance with the License. 19 | You may obtain a copy of the License at 20 | 21 | http://www.apache.org/licenses/LICENSE-2.0 22 | 23 | Unless required by applicable law or agreed to in writing, software 24 | distributed under the License is distributed on an "AS IS" BASIS, 25 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | See the License for the specific language governing permissions and 27 | limitations under the License. 28 | 29 | 30 | === Definitions 31 | 32 | 33 | 34 | [id="{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-jobcondition"] 35 | ==== JobCondition 36 | 37 | JobCondition describes the state of the job at a certain point. 
38 | 39 | .Appears In: 40 | **** 41 | - xref:{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-jobstatus[$$JobStatus$$] 42 | **** 43 | 44 | [cols="25a,75a", options="header"] 45 | |=== 46 | | Field | Description 47 | | *`type`* __xref:{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-jobconditiontype[$$JobConditionType$$]__ | Type of job condition. 48 | | *`status`* __link:https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.16/#conditionstatus-v1-core[$$ConditionStatus$$]__ | Status of the condition, one of True, False, Unknown. 49 | | *`reason`* __string__ | The reason for the condition's last transition. 50 | | *`message`* __string__ | A human readable message indicating details about the transition. 51 | | *`lastUpdateTime`* __link:https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.16/#time-v1-meta[$$Time$$]__ | The last time this condition was updated. 52 | | *`lastTransitionTime`* __link:https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.16/#time-v1-meta[$$Time$$]__ | Last time the condition transitioned from one status to another. 53 | |=== 54 | 55 | 56 | [id="{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-jobconditiontype"] 57 | ==== JobConditionType (string) 58 | 59 | JobConditionType defines all kinds of types of JobStatus. 60 | 61 | .Appears In: 62 | **** 63 | - xref:{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-jobcondition[$$JobCondition$$] 64 | **** 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | [id="{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-replicastatus"] 73 | ==== ReplicaStatus 74 | 75 | ReplicaStatus represents the current observed state of the replica. 76 | 77 | .Appears In: 78 | **** 79 | - xref:{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-jobstatus[$$JobStatus$$] 80 | **** 81 | 82 | [cols="25a,75a", options="header"] 83 | |=== 84 | | Field | Description 85 | | *`active`* __integer__ | The number of actively running pods. 
86 | | *`succeeded`* __integer__ | The number of pods which reached phase Succeeded. 87 | | *`failed`* __integer__ | The number of pods which reached phase Failed. 88 | |=== 89 | 90 | 91 | [id="{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-replicatype"] 92 | ==== ReplicaType (string) 93 | 94 | ReplicaType represents the type of the replica. Each operator needs to define its own set of ReplicaTypes. 95 | 96 | .Appears In: 97 | **** 98 | - xref:{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-jobstatus[$$JobStatus$$] 99 | **** 100 | 101 | 102 | 103 | [id="{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-restartpolicy"] 104 | ==== RestartPolicy (string) 105 | 106 | RestartPolicy describes how the replicas should be restarted. Only one of the following restart policies may be specified. If none of the following policies is specified, the default one is RestartPolicyAlways. 107 | 108 | .Appears In: 109 | **** 110 | - xref:{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-replicaspec[$$ReplicaSpec$$] 111 | **** 112 | 113 | 114 | 115 | 116 | 117 | [id="{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-schedulingpolicy"] 118 | ==== SchedulingPolicy 119 | 120 | SchedulingPolicy encapsulates various scheduling policies of the distributed training job, for example `minAvailable` for gang-scheduling. 
121 | 122 | .Appears In: 123 | **** 124 | - xref:{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-runpolicy[$$RunPolicy$$] 125 | **** 126 | 127 | [cols="25a,75a", options="header"] 128 | |=== 129 | | Field | Description 130 | | *`minAvailable`* __integer__ | 131 | |=== 132 | 133 | 134 | -------------------------------------------------------------------------------- /docs/prometheus-metrics.md: -------------------------------------------------------------------------------- 1 | # Prometheus Metrics Coverage 2 | 3 | We plan to collect a rich set of metrics in kubeflow/common's `JobController` using [Prometheus](https://prometheus.io/). 4 | The goal is to report generic metrics (e.g. metrics related to pods/jobs/services) during the lifecycle of `JobController` so that: 5 | 6 | * Other operators built on top of it will automatically report Prometheus metrics without additional effort; 7 | * It is easier for users of Kubeflow distributed training operators to monitor operator performance and behaviors using a consistent set of metrics for different distributed training operators. 8 | 9 | This document outlines the list of Prometheus metrics we plan to cover in `JobController`. We follow the metric naming convention 10 | outlined [here](https://prometheus.io/docs/practices/naming/) and the metric types supported by Prometheus [here](https://prometheus.io/docs/concepts/metric_types/). 
11 | 12 | ## Pod Metrics 13 | 14 | The following metrics related to the lifecycle of pods will be reported: 15 | 16 | | Metric Name | Metric Type | Description | 17 | | ----------- | ------------| ----------- | 18 | | created_pods_total | Counter | The total number of created pods | 19 | | restarted_pods_total | Counter | The total number of restarted pods | 20 | | deleted_pods_total | Counter | The total number of deleted pods | 21 | | failed_pods_total | Counter | The total number of failed pods | 22 | 23 | The following metrics will be reported on each pod: 24 | 25 | | Metric Name | Metric Type | Description | 26 | | ----------- | ------------| ----------- | 27 | | container_cpu_usage_seconds_total | Counter | Cumulative CPU time consumed in seconds | 28 | | container_accelerator_memory_used_bytes | Gauge | Total accelerator memory allocated | 29 | | container_memory_usage_bytes | Gauge | Current memory usage in bytes, including all memory regardless of when it was accessed | 30 | | container_network_transmit_bytes_total | Counter | Cumulative count of bytes transmitted | 31 | | container_fs_usage_bytes | Gauge | Number of bytes that are consumed by the container on this filesystem | 32 | | up | Gauge | Keep-Alive check (maintained by Prometheus on its own with its `up` metric detailed in the documentation [here](https://prometheus.io/docs/concepts/jobs_instances/#automatically-generated-labels-and-time-series)) | 33 | 34 | Note that some of the above metrics are derived from [cAdvisor](https://github.com/google/cadvisor) kubelet 35 | integration which reports to Prometheus through our prometheus-operator installation. 
36 | 37 | ## Job Metrics 38 | 39 | The following metrics related to the lifecycle of jobs will be reported: 40 | 41 | | Metric Name | Metric Type | Description | 42 | | ----------- | ------------| ----------- | 43 | | created_jobs_total | Counter | The total number of created jobs | 44 | | deleted_jobs_total | Counter | The total number of deleted jobs | 45 | | completed_jobs_total | Counter | The total number of completed jobs | 46 | | restarted_jobs_total | Counter | The total number of restarted jobs | 47 | | pending_jobs_total | Gauge | The total number of pending jobs | 48 | | failed_jobs_total | Counter | The total number of failed jobs | 49 | | running_jobs_total | Gauge | The total number of running jobs | 50 | 51 | The following metrics related to the duration among various job phases will be reported: 52 | 53 | | Metric Name | Metric Type | Description | 54 | | ----------- | ------------| ----------- | 55 | | from_created_to_completed_job_duration_seconds_total | Histogram | The duration between job created and job completed in seconds | 56 | | from_completed_to_deleted_job_duration_seconds_total | Histogram | The duration between job completed and job deleted in seconds | 57 | | from_failed_to_restarted_job_duration_seconds_total | Histogram | The duration between job failed and job restarted in seconds | 58 | | from_pending_to_running_job_duration_seconds_total | Histogram | The duration between job pending and job running in seconds | 59 | 60 | ## Service Metrics 61 | 62 | The following metrics related to the lifecycle of services will be reported: 63 | 64 | | Metric Name | Metric Type | Description | 65 | | ----------- | ------------| ----------- | 66 | | succeeded_service_creations_total | Counter | The total number of succeeded service creations | 67 | | failed_service_creations_total | Counter | The total number of failed service creations | 68 | | restarted_service_creations_total | Counter | The total number of restarted service creations | 69 | 
| service_patches_total | Counter | The total number of service patches | 70 | | deleted_services_total | Counter | The total number of deleted services | 71 | 72 | ## Scheduling Metrics 73 | 74 | The following metrics related to scheduling will be reported: 75 | 76 | | Metric Name | Metric Type | Description | 77 | | ----------- | ------------| ----------- | 78 | | created_pod_disruption_policies_total | Counter | The total number of created pod disruption policies | 79 | | deleted_pod_disruption_policies_total | Counter | The total number of deleted pod disruption policies | 80 | | created_pod_groups_total | Counter | The total number of created pod groups | 81 | | deleted_pod_groups_total | Counter | The total number of deleted pod groups | 82 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/kubeflow/common 2 | 3 | go 1.19 4 | 5 | require ( 6 | github.com/go-logr/logr v1.2.3 7 | github.com/google/go-cmp v0.5.8 8 | github.com/prometheus/client_golang v1.12.2 9 | github.com/sirupsen/logrus v1.8.1 10 | github.com/stretchr/testify v1.8.0 11 | k8s.io/api v0.25.3 12 | k8s.io/apimachinery v0.25.3 13 | k8s.io/client-go v0.25.3 14 | k8s.io/code-generator v0.25.3 15 | k8s.io/klog/v2 v2.70.1 16 | k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 17 | k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed 18 | sigs.k8s.io/controller-runtime v0.13.0 19 | sigs.k8s.io/scheduler-plugins v0.24.9 20 | volcano.sh/apis v1.2.0-k8s1.19.6 21 | ) 22 | 23 | require ( 24 | cloud.google.com/go v0.97.0 // indirect 25 | github.com/PuerkitoBio/purell v1.1.1 // indirect 26 | github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect 27 | github.com/beorn7/perks v1.0.1 // indirect 28 | github.com/cespare/xxhash/v2 v2.1.2 // indirect 29 | github.com/davecgh/go-spew v1.1.1 // indirect 30 | github.com/emicklei/go-restful/v3 v3.8.0 // 
indirect 31 | github.com/evanphx/json-patch v4.12.0+incompatible // indirect 32 | github.com/evanphx/json-patch/v5 v5.6.0 // indirect 33 | github.com/fsnotify/fsnotify v1.5.4 // indirect 34 | github.com/go-openapi/jsonpointer v0.19.5 // indirect 35 | github.com/go-openapi/jsonreference v0.19.5 // indirect 36 | github.com/go-openapi/swag v0.19.14 // indirect 37 | github.com/gogo/protobuf v1.3.2 // indirect 38 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 39 | github.com/golang/protobuf v1.5.2 // indirect 40 | github.com/google/gnostic v0.5.7-v3refs // indirect 41 | github.com/google/gofuzz v1.1.0 // indirect 42 | github.com/google/uuid v1.3.0 // indirect 43 | github.com/imdario/mergo v0.3.12 // indirect 44 | github.com/josharian/intern v1.0.0 // indirect 45 | github.com/json-iterator/go v1.1.12 // indirect 46 | github.com/mailru/easyjson v0.7.6 // indirect 47 | github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect 48 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 49 | github.com/modern-go/reflect2 v1.0.2 // indirect 50 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 51 | github.com/pkg/errors v0.9.1 // indirect 52 | github.com/pmezard/go-difflib v1.0.0 // indirect 53 | github.com/prometheus/client_model v0.2.0 // indirect 54 | github.com/prometheus/common v0.32.1 // indirect 55 | github.com/prometheus/procfs v0.7.3 // indirect 56 | github.com/spf13/pflag v1.0.5 // indirect 57 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4 // indirect 58 | golang.org/x/net v0.3.1-0.20221206200815-1e63c2f08a10 // indirect 59 | golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 // indirect 60 | golang.org/x/sys v0.3.0 // indirect 61 | golang.org/x/term v0.3.0 // indirect 62 | golang.org/x/text v0.5.0 // indirect 63 | golang.org/x/time v0.0.0-20220609170525-579cf78fd858 // indirect 64 | golang.org/x/tools v0.1.12 // indirect 65 | 
gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect 66 | google.golang.org/appengine v1.6.7 // indirect 67 | google.golang.org/protobuf v1.28.0 // indirect 68 | gopkg.in/inf.v0 v0.9.1 // indirect 69 | gopkg.in/yaml.v2 v2.4.0 // indirect 70 | gopkg.in/yaml.v3 v3.0.1 // indirect 71 | k8s.io/apiextensions-apiserver v0.25.0 // indirect 72 | k8s.io/component-base v0.25.0 // indirect 73 | sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect 74 | sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect 75 | sigs.k8s.io/yaml v1.3.0 // indirect 76 | ) 77 | -------------------------------------------------------------------------------- /hack/boilerplate/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | // Copyright YEAR The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | -------------------------------------------------------------------------------- /hack/generate-apidoc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 The Kubeflow Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script generates the API reference documentation (docs/api/generated.asciidoc) 18 | # from the API types in pkg/apis/common/v1 using crd-ref-docs. 19 | 20 | set -o errexit 21 | set -o nounset 22 | set -o pipefail 23 | 24 | SCRIPT_ROOT=$(dirname ${BASH_SOURCE})/.. 25 | 26 | cd ${SCRIPT_ROOT} 27 | 28 | crd-ref-docs --log-level DEBUG\ 29 | --source-path ./pkg/apis/common/v1 \ 30 | --config ./docs/api/autogen/config.yaml \ 31 | --templates-dir ./docs/api/autogen/templates \ 32 | --output-path ./docs/api/generated.asciidoc \ 33 | --max-depth 30 34 | 35 | cd - > /dev/null 36 | -------------------------------------------------------------------------------- /hack/scripts/update-changelog.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 The Kubeflow Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Update CHANGELOG.md using github_changelog_generator.
18 | # 19 | # The script will compute changes between release tags. So make sure there is 20 | # a release tag corresponding to the release you want to compute the changes 21 | # for. 22 | set -o errexit 23 | set -o nounset 24 | set -o pipefail 25 | 26 | GITHUB_TOKEN=${GITHUB_TOKEN:-"NO"} 27 | 28 | SCRIPT_ROOT=$(dirname ${BASH_SOURCE})/../.. 29 | 30 | cd ${SCRIPT_ROOT} 31 | 32 | if [ "${GITHUB_TOKEN}" == "NO" ] 33 | then 34 | echo "Environment variable GITHUB_TOKEN is not set." 35 | exit 1 36 | fi 37 | 38 | github_changelog_generator -t ${GITHUB_TOKEN} -u kubeflow -p common \ 39 | --exclude-labels community/discussion,community/question,duplicate,question,invalid,wontfix \ 40 | --bug-labels kind/bug,problems/bug \ 41 | --enhancement-labels improvement/optimization,kind/enhancement,improvement/enhancement,addition/feature,kind/feature \ 42 | --enhancement-label "**Features and improvements:**" 43 | 44 | cd - > /dev/null 45 | -------------------------------------------------------------------------------- /hack/update-codegen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019 The Kubeflow Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This shell is used to auto generate some useful tools for k8s, such as lister, 18 | # informer, deepcopy, defaulter and so on.
19 | 20 | set -o errexit 21 | set -o nounset 22 | set -o pipefail 23 | 24 | SCRIPT_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 25 | echo ">> Script root ${SCRIPT_ROOT}" 26 | ROOT_PKG=github.com/kubeflow/common 27 | 28 | # Grab code-generator version from go.mod 29 | CODEGEN_VERSION=$(grep 'k8s.io/code-generator' go.mod | awk '{print $2}' | head -1) 30 | CODEGEN_PKG=$(echo `go env GOPATH`"/pkg/mod/k8s.io/code-generator@${CODEGEN_VERSION}") 31 | 32 | # Grab kube-openapi version from go.mod 33 | OPENAPI_VERSION=$(grep 'k8s.io/kube-openapi' go.mod | awk '{print $2}' | head -1) 34 | # remove /go.mod if it happens to match the version 35 | if [[ $OPENAPI_VERSION == */go.mod ]]; then 36 | OPENAPI_VERSION=${OPENAPI_VERSION%/*} 37 | fi 38 | 39 | OPENAPI_PKG=$(echo `go env GOPATH`"/pkg/mod/k8s.io/kube-openapi@${OPENAPI_VERSION}") 40 | 41 | if [[ ! -d ${CODEGEN_PKG} || ! -d ${OPENAPI_PKG} ]]; then 42 | echo "${CODEGEN_PKG} or ${OPENAPI_PKG} is missing. Running 'go mod download'." 43 | go mod download 44 | fi 45 | 46 | echo ">> Using ${CODEGEN_PKG}" 47 | echo ">> Using ${OPENAPI_PKG}" 48 | # Ensure we can execute shell scripts. 49 | chmod +x ${CODEGEN_PKG}/generate-groups.sh 50 | 51 | # code-generator does work with go.mod but makes assumptions about 52 | # the project living in `$GOPATH/src`. To work around this and support 53 | # any location; create a temporary directory, use this as an output 54 | # base, and copy everything back once generated. 55 | TEMP_DIR=$(mktemp -d) 56 | cleanup() { 57 | echo ">> Removing ${TEMP_DIR}" 58 | rm -rf ${TEMP_DIR} 59 | } 60 | trap "cleanup" EXIT SIGINT 61 | 62 | echo ">> Temporary output directory ${TEMP_DIR}" 63 | 64 | # generate the code with: 65 | # --output-base because this script should also be able to run inside the vendor dir of 66 | # k8s.io/kubernetes. The output-base is needed for the generators to output into the vendor dir 67 | # instead of the $GOPATH directly. For normal projects this can be dropped. 
68 | cd ${SCRIPT_ROOT} 69 | ${CODEGEN_PKG}/generate-groups.sh "deepcopy" \ 70 | github.com/kubeflow/common/pkg/client github.com/kubeflow/common/pkg/apis \ 71 | common:v1 \ 72 | --output-base "${TEMP_DIR}" \ 73 | --go-header-file hack/boilerplate/boilerplate.go.txt 74 | 75 | ${CODEGEN_PKG}/generate-groups.sh "all" \ 76 | github.com/kubeflow/common/test_job/client github.com/kubeflow/common/test_job/apis \ 77 | test_job:v1 \ 78 | --output-base "${TEMP_DIR}" \ 79 | --go-header-file hack/boilerplate/boilerplate.go.txt 80 | 81 | # Notice: The code in code-generator does not generate defaulter by default. 82 | # We need to build binary from vendor cmd folder. 83 | #echo "Building defaulter-gen" 84 | #go get k8s.io/code-generator/cmd/defaulter-gen@v0.19.9 85 | #go build -o ${GOPATH}/bin/defaulter-gen ${CODEGEN_PKG}/cmd/defaulter-gen 86 | 87 | echo "Generating defaulters for common/v1" 88 | ${GOPATH}/bin/defaulter-gen --input-dirs github.com/kubeflow/common/pkg/apis/common/v1 \ 89 | -O zz_generated.defaults \ 90 | --output-package github.com/kubeflow/common/pkg/apis/common/v1 \ 91 | --go-header-file hack/boilerplate/boilerplate.go.txt "$@" \ 92 | --output-base "${TEMP_DIR}" 93 | 94 | echo "Generating defaulters for test_job/v1" 95 | ${GOPATH}/bin/defaulter-gen --input-dirs github.com/kubeflow/common/test_job/apis/test_job/v1 \ 96 | -O zz_generated.defaults \ 97 | --output-package github.com/kubeflow/common/test_job/apis/test_job/v1 \ 98 | --go-header-file hack/boilerplate/boilerplate.go.txt "$@" \ 99 | --output-base "${TEMP_DIR}" 100 | 101 | echo "Building openapi-gen" 102 | GOFLAGS=-mod=mod go build -o ${GOPATH}/bin/openapi-gen ${OPENAPI_PKG}/cmd/openapi-gen 103 | 104 | echo "Generating OpenAPI specification for common/v1" 105 | ${GOPATH}/bin/openapi-gen --input-dirs github.com/kubeflow/common/pkg/apis/common/v1 \ 106 | --output-package github.com/kubeflow/common/pkg/apis/common/v1 \ 107 | --go-header-file hack/boilerplate/boilerplate.go.txt "$@" \ 108 | --output-base 
"${TEMP_DIR}" 109 | 110 | echo "Generating OpenAPI specification for test_job/v1" 111 | ${GOPATH}/bin/openapi-gen --input-dirs github.com/kubeflow/common/test_job/apis/test_job/v1 \ 112 | --output-package github.com/kubeflow/common/test_job/apis/test_job/v1 \ 113 | --go-header-file hack/boilerplate/boilerplate.go.txt "$@" \ 114 | --output-base "${TEMP_DIR}" 115 | 116 | ## Copy everything back. 117 | cp -a "${TEMP_DIR}/${ROOT_PKG}/." "${SCRIPT_ROOT}/" 118 | -------------------------------------------------------------------------------- /hack/verify-codegen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 The Kubeflow Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -o errexit 18 | set -o nounset 19 | set -o pipefail 20 | 21 | SCRIPT_ROOT=$(dirname "${BASH_SOURCE}")/.. 22 | 23 | DIFFROOT="${SCRIPT_ROOT}/pkg" 24 | TMP_DIFFROOT="${SCRIPT_ROOT}/_tmp/pkg" 25 | _tmp="${SCRIPT_ROOT}/_tmp" 26 | 27 | cleanup() { 28 | rm -rf "${_tmp}" 29 | } 30 | trap "cleanup" EXIT SIGINT 31 | 32 | cleanup 33 | 34 | mkdir -p "${TMP_DIFFROOT}" 35 | cp -a "${DIFFROOT}"/* "${TMP_DIFFROOT}" 36 | 37 | "${SCRIPT_ROOT}/hack/update-codegen.sh" 38 | echo "diffing ${DIFFROOT} against freshly generated codegen" 39 | ret=0 40 | diff -Naupr "${DIFFROOT}" "${TMP_DIFFROOT}" || ret=$? 
41 | cp -a "${TMP_DIFFROOT}"/* "${DIFFROOT}" 42 | if [[ $ret -eq 0 ]] 43 | then 44 | echo "${DIFFROOT} up to date." 45 | else 46 | echo "${DIFFROOT} is out of date. Please run hack/update-codegen.sh" 47 | exit 1 48 | fi 49 | -------------------------------------------------------------------------------- /hack/verify-gomod.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2021 The Kubeflow Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | set -o errexit 19 | set -o nounset 20 | set -o pipefail 21 | 22 | go mod tidy 23 | STATUS=$( git status --porcelain go.mod go.sum ) 24 | if [ ! -z "$STATUS" ]; then 25 | echo "Running go mod tidy modified go.mod and/or go.sum" 26 | git --no-pager diff -p 27 | exit 1 28 | fi 29 | exit 0 -------------------------------------------------------------------------------- /linter_config.yaml: -------------------------------------------------------------------------------- 1 | # This file contains golangci-lint configurations 2 | 3 | run: 4 | # default concurrency is a available CPU number 5 | concurrency: 4 6 | 7 | # timeout for analysis, e.g. 
30s, 5m, default is 1m 8 | timeout: 300s 9 | 10 | # exit code when at least one issue was found, default is 1 11 | issues-exit-code: 1 12 | 13 | # include test files or not, default is true 14 | tests: true 15 | 16 | # which dirs to skip: issues from them won't be reported; 17 | skip-dirs: 18 | - test_job/client 19 | - pkg/apis/common/v1/zz_generated.deepcopy.go 20 | - pkg/apis/common/v1/zz_generated.defaults.go 21 | - test_job/apis/test_job/v1/zz_generated.deepcopy.go 22 | - test_job/apis/test_job/v1/zz_generated.defaults.go 23 | 24 | # default is true. Enables skipping of directories: 25 | # vendor$, third_party$, testdata$, examples$, Godeps$, builtin$ 26 | skip-dirs-use-default: true 27 | 28 | linters: 29 | # please, do not use `enable-all`: it's deprecated and will be removed soon. 30 | # inverted configuration with `enable-all` and `disable` is not scalable during updates of golangci-lint 31 | disable-all: true 32 | enable: 33 | - bodyclose 34 | - deadcode 35 | - misspell 36 | - lll 37 | - typecheck 38 | - unconvert 39 | - unused 40 | - varcheck 41 | - govet 42 | - staticcheck 43 | linters-settings: 44 | lll: 45 | # max line length, lines longer will be reported. Default is 120. 46 | line-length: 240 47 | -------------------------------------------------------------------------------- /pkg/apis/common/v1/constants.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023 The Kubeflow Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package v1 18 | 19 | const ( 20 | 21 | // ReplicaIndexLabel represents the label key for the replica-index, e.g. 0, 1, 2.. etc 22 | ReplicaIndexLabel = "training.kubeflow.org/replica-index" 23 | 24 | // ReplicaTypeLabel represents the label key for the replica-type, e.g. ps, worker etc. 25 | ReplicaTypeLabel = "training.kubeflow.org/replica-type" 26 | 27 | // OperatorNameLabel represents the label key for the operator name, e.g. tf-operator, mpi-operator, etc. 28 | OperatorNameLabel = "training.kubeflow.org/operator-name" 29 | 30 | // JobNameLabel represents the label key for the job name, the value is the job name. 31 | JobNameLabel = "training.kubeflow.org/job-name" 32 | 33 | // JobRoleLabel represents the label key for the job role, e.g. master. 34 | JobRoleLabel = "training.kubeflow.org/job-role" 35 | ) 36 | -------------------------------------------------------------------------------- /pkg/apis/common/v1/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // +k8s:deepcopy-gen=package,register 16 | // +k8s:defaulter-gen=TypeMeta 17 | 18 | // Package v1 is the v1 version of the API. 
19 | // +groupName=kubeflow.org 20 | package v1 21 | -------------------------------------------------------------------------------- /pkg/apis/common/v1/interface.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023 The Kubeflow Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package v1 18 | 19 | import ( 20 | v1 "k8s.io/api/core/v1" 21 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 22 | "k8s.io/apimachinery/pkg/runtime/schema" 23 | ) 24 | 25 | // ControllerInterface defines the Interface to be implemented by custom operators. e.g. tf-operator needs to implement this interface 26 | type ControllerInterface interface { 27 | // ControllerName returns the Controller name 28 | ControllerName() string 29 | 30 | // GetAPIGroupVersionKind returns the GroupVersionKind of the API 31 | GetAPIGroupVersionKind() schema.GroupVersionKind 32 | 33 | // GetAPIGroupVersion returns the GroupVersion of the API 34 | GetAPIGroupVersion() schema.GroupVersion 35 | 36 | // GetGroupNameLabelValue returns the Group Name(value) in the labels of the job 37 | GetGroupNameLabelValue() string 38 | 39 | // GetJobFromInformerCache returns the Job from Informer Cache 40 | GetJobFromInformerCache(namespace, name string) (metav1.Object, error) 41 | 42 | // GetJobFromAPIClient returns the Job from API server 43 | GetJobFromAPIClient(namespace, name string) (metav1.Object, error) 44 | 45 | // GetPodsForJob returns the pods managed by the job. This can be achieved by selecting pods using label key "job-name" 46 | // i.e.
all pods created by the job will come with label "job-name" = the name of the job 47 | GetPodsForJob(job interface{}) ([]*v1.Pod, error) 48 | 49 | // GetServicesForJob returns the services managed by the job. This can be achieved by selecting services using label key "job-name" 50 | // i.e. all services created by the job will come with label "job-name" = the name of the job 51 | GetServicesForJob(job interface{}) ([]*v1.Service, error) 52 | 53 | // DeleteJob deletes the job 54 | DeleteJob(job interface{}) error 55 | 56 | // UpdateJobStatus updates the job status and job conditions 57 | UpdateJobStatus(job interface{}, replicas map[ReplicaType]*ReplicaSpec, jobStatus *JobStatus) error 58 | 59 | // UpdateJobStatusInApiServer updates the job status in API server 60 | UpdateJobStatusInApiServer(job interface{}, jobStatus *JobStatus) error 61 | 62 | // SetClusterSpec sets the cluster spec for the pod 63 | SetClusterSpec(job interface{}, podTemplate *v1.PodTemplateSpec, rtype, index string) error 64 | 65 | // GetDefaultContainerName returns the default container name in pod 66 | GetDefaultContainerName() string 67 | 68 | // GetDefaultContainerPortName returns the default container port name 69 | GetDefaultContainerPortName() string 70 | 71 | // IsMasterRole returns whether the replica of the given type and index is a master role. 72 | // MasterRole pod will have "job-role=master" set in its label 73 | IsMasterRole(replicas map[ReplicaType]*ReplicaSpec, rtype ReplicaType, index int) bool 74 | 75 | // ReconcileJobs checks and updates replicas for each given ReplicaSpec of a job. 76 | // Common implementation will be provided and User can still override this to implement their own reconcile logic 77 | ReconcileJobs(job interface{}, replicas map[ReplicaType]*ReplicaSpec, jobStatus JobStatus, runPolicy *RunPolicy) error 78 | 79 | // ReconcilePods checks and updates pods for each given ReplicaSpec. 80 | // It will requeue the job in case of an error while creating/deleting pods.
81 | // Common implementation will be provided and User can still override this to implement their own reconcile logic 82 | ReconcilePods(job interface{}, jobStatus *JobStatus, pods []*v1.Pod, rtype ReplicaType, spec *ReplicaSpec, 83 | replicas map[ReplicaType]*ReplicaSpec) error 84 | 85 | // ReconcileServices checks and updates services for each given ReplicaSpec. 86 | // It will requeue the job in case of an error while creating/deleting services. 87 | // Common implementation will be provided and User can still override this to implement their own reconcile logic 88 | ReconcileServices(job metav1.Object, services []*v1.Service, rtype ReplicaType, spec *ReplicaSpec) error 89 | } 90 | -------------------------------------------------------------------------------- /pkg/apis/common/v1/zz_generated.defaults.go: -------------------------------------------------------------------------------- 1 | //go:build !ignore_autogenerated 2 | // +build !ignore_autogenerated 3 | 4 | // Copyright 2023 The Kubeflow Authors 5 | // 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | 18 | // Code generated by defaulter-gen. DO NOT EDIT. 19 | 20 | package v1 21 | 22 | import ( 23 | runtime "k8s.io/apimachinery/pkg/runtime" 24 | ) 25 | 26 | // RegisterDefaults adds defaulters functions to the given scheme. 27 | // Public to allow building arbitrary schemes. 28 | // All generated defaulters are covering - they call all nested defaulters.
29 | func RegisterDefaults(scheme *runtime.Scheme) error { 30 | return nil 31 | } 32 | -------------------------------------------------------------------------------- /pkg/controller.v1/common/scheduling.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023 The Kubeflow Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package common 18 | 19 | import ( 20 | "context" 21 | "errors" 22 | "fmt" 23 | 24 | apiv1 "github.com/kubeflow/common/pkg/apis/common/v1" 25 | 26 | "github.com/google/go-cmp/cmp" 27 | log "github.com/sirupsen/logrus" 28 | policyapi "k8s.io/api/policy/v1beta1" 29 | k8serrors "k8s.io/apimachinery/pkg/api/errors" 30 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 31 | "k8s.io/apimachinery/pkg/util/intstr" 32 | "k8s.io/klog/v2" 33 | "sigs.k8s.io/controller-runtime/pkg/client" 34 | ) 35 | // FillPodGroupSpecFunc fills in the spec of the given PodGroup object and reports an error if it cannot. 36 | type FillPodGroupSpecFunc func(object metav1.Object) error 37 | // SyncPodGroup updates the job's PodGroup when specFunc changes it, or creates it (owned by the job) when it does not exist yet, and returns the resulting PodGroup. 38 | func (jc *JobController) SyncPodGroup(job metav1.Object, specFunc FillPodGroupSpecFunc) (metav1.Object, error) { 39 | pgctl := jc.PodGroupControl 40 | 41 | // Check whether podGroup exists or not 42 | podGroup, err := pgctl.GetPodGroup(job.GetNamespace(), job.GetName()) 43 | if err == nil { 44 | // update podGroup for gang scheduling; snapshot it first because specFunc fills podGroup in place, so the diff below compares the state before and after 45 | oldPodGroup := podGroup.(client.Object).DeepCopyObject() 46 | if err = specFunc(podGroup); err != nil { 47 | return nil, fmt.Errorf("unable to fill the spec of PodGroup, '%v': %v",
klog.KObj(podGroup), err) 48 | } 49 | if diff := cmp.Diff(oldPodGroup, podGroup); len(diff) != 0 { 50 | return podGroup, pgctl.UpdatePodGroup(podGroup.(client.Object)) 51 | } 52 | return podGroup, nil 53 | } else if client.IgnoreNotFound(err) != nil { 54 | return nil, fmt.Errorf("unable to get a PodGroup: %v", err) 55 | } else { 56 | // create podGroup for gang scheduling 57 | newPodGroup := pgctl.NewEmptyPodGroup() 58 | newPodGroup.SetName(job.GetName()) 59 | newPodGroup.SetNamespace(job.GetNamespace()) 60 | newPodGroup.SetAnnotations(job.GetAnnotations()) 61 | newPodGroup.SetOwnerReferences([]metav1.OwnerReference{*jc.GenOwnerReference(job)}) 62 | if err = specFunc(newPodGroup); err != nil { 63 | return nil, fmt.Errorf("unable to fill the spec of PodGroup, '%v': %v", klog.KObj(newPodGroup), err) 64 | } 65 | 66 | err = pgctl.CreatePodGroup(newPodGroup) 67 | if err != nil { 68 | return podGroup, fmt.Errorf("unable to create PodGroup: %v", err) 69 | } 70 | createdPodGroupsCount.Inc() 71 | } 72 | 73 | createdPodGroup, err := pgctl.GetPodGroup(job.GetNamespace(), job.GetName()) 74 | if err != nil { 75 | return nil, fmt.Errorf("unable to get PodGroup after success creation: %v", err) 76 | } 77 | 78 | return createdPodGroup, nil 79 | } 80 | 81 | // SyncPdb will create a PDB for gang scheduling.
82 | func (jc *JobController) SyncPdb(job metav1.Object, minAvailableReplicas int32) (*policyapi.PodDisruptionBudget, error) { 83 | // Check the pdb exist or not 84 | pdb, err := jc.KubeClientSet.PolicyV1beta1().PodDisruptionBudgets(job.GetNamespace()).Get(context.TODO(), job.GetName(), metav1.GetOptions{}) 85 | if err == nil || !k8serrors.IsNotFound(err) { 86 | if err == nil { 87 | err = errors.New(string(metav1.StatusReasonAlreadyExists)) 88 | } 89 | return pdb, err 90 | } 91 | 92 | // Create pdb for gang scheduling 93 | minAvailable := intstr.FromInt(int(minAvailableReplicas)) 94 | createPdb := &policyapi.PodDisruptionBudget{ 95 | ObjectMeta: metav1.ObjectMeta{ 96 | Name: job.GetName(), 97 | OwnerReferences: []metav1.OwnerReference{ 98 | *jc.GenOwnerReference(job), 99 | }, 100 | }, 101 | Spec: policyapi.PodDisruptionBudgetSpec{ 102 | MinAvailable: &minAvailable, 103 | Selector: &metav1.LabelSelector{ 104 | MatchLabels: map[string]string{ 105 | apiv1.JobNameLabel: job.GetName(), 106 | }, 107 | }, 108 | }, 109 | } 110 | createdPdb, err := jc.KubeClientSet.PolicyV1beta1().PodDisruptionBudgets(job.GetNamespace()).Create(context.TODO(), createPdb, metav1.CreateOptions{}) 111 | if err != nil { 112 | return createdPdb, fmt.Errorf("unable to create pdb: %v", err) 113 | } 114 | createdPDBCount.Inc() 115 | return createdPdb, nil 116 | } 117 | 118 | func (jc *JobController) DeletePodGroup(job metav1.Object) error { 119 | pgctl := jc.PodGroupControl 120 | 121 | // Check whether podGroup exists or not 122 | _, err := pgctl.GetPodGroup(job.GetNamespace(), job.GetName()) 123 | if err != nil && k8serrors.IsNotFound(err) { 124 | return nil 125 | } 126 | 127 | log.Infof("Deleting PodGroup %s", job.GetName()) 128 | 129 | // Delete podGroup 130 | err = pgctl.DeletePodGroup(job.GetNamespace(), job.GetName()) 131 | if err != nil { 132 | return fmt.Errorf("unable to delete PodGroup: %v", err) 133 | } 134 | deletedPodGroupsCount.Inc() 135 | return nil 136 | } 137 | 138 | func (jc 
*JobController) DeletePdb(job metav1.Object) error { 139 | // Check whether pdb exists or not 140 | _, err := jc.KubeClientSet.PolicyV1beta1().PodDisruptionBudgets(job.GetNamespace()).Get(context.TODO(), job.GetName(), metav1.GetOptions{}) 141 | if err != nil && k8serrors.IsNotFound(err) { 142 | return nil 143 | } 144 | 145 | msg := fmt.Sprintf("Deleting pdb %s", job.GetName()) 146 | log.Info(msg) 147 | 148 | if err := jc.KubeClientSet.PolicyV1beta1().PodDisruptionBudgets(job.GetNamespace()).Delete(context.TODO(), job.GetName(), metav1.DeleteOptions{}); err != nil { 149 | return fmt.Errorf("unable to delete pdb: %v", err) 150 | } 151 | deletedPDBCount.Inc() 152 | return nil 153 | } 154 | -------------------------------------------------------------------------------- /pkg/controller.v1/common/status.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | apiv1 "github.com/kubeflow/common/pkg/apis/common/v1" 5 | "github.com/kubeflow/common/pkg/core" 6 | corev1 "k8s.io/api/core/v1" 7 | ) 8 | 9 | // initializeReplicaStatuses initializes the ReplicaStatuses for replica. 10 | func initializeReplicaStatuses(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType) { 11 | core.InitializeReplicaStatuses(jobStatus, rtype) 12 | } 13 | 14 | // updateJobReplicaStatuses updates the JobReplicaStatuses according to the pod. 
15 | func updateJobReplicaStatuses(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, pod *corev1.Pod) { 16 | core.UpdateJobReplicaStatuses(jobStatus, rtype, pod) 17 | } 18 | -------------------------------------------------------------------------------- /pkg/controller.v1/common/status_test.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | apiv1 "github.com/kubeflow/common/pkg/apis/common/v1" 8 | "github.com/stretchr/testify/assert" 9 | corev1 "k8s.io/api/core/v1" 10 | metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1" 11 | ) 12 | 13 | func TestUpdateJobReplicaStatuses(t *testing.T) { 14 | jobStatus := apiv1.JobStatus{} 15 | initializeReplicaStatuses(&jobStatus, "worker") 16 | _, ok := jobStatus.ReplicaStatuses["worker"] 17 | // assert ReplicaStatus for "worker" exists 18 | assert.True(t, ok) 19 | setStatusForTest(&jobStatus, "worker", 2, 3, 1, 1) 20 | // terminating pod should count as failed. 
21 | assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Failed, int32(3)) 22 | assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Succeeded, int32(3)) 23 | assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Active, int32(1)) 24 | } 25 | 26 | func setStatusForTest(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, failed, succeeded, active, terminating int32) { 27 | pod := corev1.Pod{ 28 | Status: corev1.PodStatus{}, 29 | } 30 | var i int32 31 | for i = 0; i < failed; i++ { 32 | pod.Status.Phase = corev1.PodFailed 33 | updateJobReplicaStatuses(jobStatus, rtype, &pod) 34 | } 35 | for i = 0; i < succeeded; i++ { 36 | pod.Status.Phase = corev1.PodSucceeded 37 | updateJobReplicaStatuses(jobStatus, rtype, &pod) 38 | } 39 | for i = 0; i < active; i++ { 40 | pod.Status.Phase = corev1.PodRunning 41 | updateJobReplicaStatuses(jobStatus, rtype, &pod) 42 | } 43 | for i = 0; i < terminating; i++ { 44 | pod.Status.Phase = corev1.PodRunning 45 | deletionTimestamp := metaV1.NewTime(time.Now()) 46 | pod.DeletionTimestamp = &deletionTimestamp 47 | updateJobReplicaStatuses(jobStatus, rtype, &pod) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /pkg/controller.v1/common/util.go: -------------------------------------------------------------------------------- 1 | // Copyright 2019 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package common 16 | 17 | import ( 18 | "fmt" 19 | "sort" 20 | "strings" 21 | 22 | apiv1 "github.com/kubeflow/common/pkg/apis/common/v1" 23 | log "github.com/sirupsen/logrus" 24 | v1 "k8s.io/api/core/v1" 25 | schedulingv1 "k8s.io/api/scheduling/v1" 26 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 27 | ) 28 | 29 | // ReplicasPriority is a slice of ReplicaPriority. 30 | type ReplicasPriority []ReplicaPriority 31 | 32 | type ReplicaPriority struct { 33 | priority int32 34 | 35 | apiv1.ReplicaSpec 36 | } 37 | 38 | func (p ReplicasPriority) Len() int { 39 | return len(p) 40 | } 41 | 42 | func (p ReplicasPriority) Less(i, j int) bool { 43 | return p[i].priority > p[j].priority 44 | } 45 | 46 | func (p ReplicasPriority) Swap(i, j int) { 47 | p[i], p[j] = p[j], p[i] 48 | } 49 | 50 | func GenGeneralName(jobName string, rtype string, index string) string { 51 | n := jobName + "-" + strings.ToLower(rtype) + "-" + index 52 | return strings.Replace(n, "/", "-", -1) 53 | } 54 | 55 | // RecheckDeletionTimestamp returns a CanAdopt() function to recheck deletion. 56 | // 57 | // The CanAdopt() function calls getObject() to fetch the latest value, 58 | // and denies adoption attempts if that object has a non-nil DeletionTimestamp. 
59 | func RecheckDeletionTimestamp(getObject func() (metav1.Object, error)) func() error { 60 | return func() error { 61 | obj, err := getObject() 62 | if err != nil { 63 | return fmt.Errorf("can't recheck DeletionTimestamp: %v", err) 64 | } 65 | if obj.GetDeletionTimestamp() != nil { 66 | return fmt.Errorf("%v/%v has just been deleted at %v", obj.GetNamespace(), obj.GetName(), obj.GetDeletionTimestamp()) 67 | } 68 | return nil 69 | } 70 | } 71 | 72 | func MaxInt(x, y int) int { 73 | if x < y { 74 | return y 75 | } 76 | return x 77 | } 78 | 79 | func AddResourceList(list, req, limit v1.ResourceList) { 80 | for name, quantity := range req { 81 | 82 | if value, ok := list[name]; !ok { 83 | list[name] = quantity.DeepCopy() 84 | } else { 85 | value.Add(quantity) 86 | list[name] = value 87 | } 88 | } 89 | 90 | if req != nil { 91 | return 92 | } 93 | 94 | // If Requests is omitted for a container, 95 | // it defaults to Limits if that is explicitly specified. 96 | for name, quantity := range limit { 97 | if value, ok := list[name]; !ok { 98 | list[name] = quantity.DeepCopy() 99 | } else { 100 | value.Add(quantity) 101 | list[name] = value 102 | } 103 | } 104 | } 105 | 106 | type PriorityClassGetFunc func(string) (*schedulingv1.PriorityClass, error) 107 | 108 | func CalcPGMinResources(minMember int32, replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec, pcGetFunc PriorityClassGetFunc) *v1.ResourceList { 109 | var replicasPriority ReplicasPriority 110 | for t, replica := range replicas { 111 | rp := ReplicaPriority{0, *replica} 112 | pc := replica.Template.Spec.PriorityClassName 113 | 114 | priorityClass, err := pcGetFunc(pc) 115 | if err != nil || priorityClass == nil { 116 | log.Warnf("Ignore task %s priority class %s: %v", t, pc, err) 117 | } else { 118 | rp.priority = priorityClass.Value 119 | } 120 | 121 | replicasPriority = append(replicasPriority, rp) 122 | } 123 | 124 | sort.Sort(replicasPriority) 125 | 126 | minAvailableTasksRes := v1.ResourceList{} 127 | podCnt := 
int32(0) 128 | for _, task := range replicasPriority { 129 | if task.Replicas == nil { 130 | continue 131 | } 132 | 133 | for i := int32(0); i < *task.Replicas; i++ { 134 | if podCnt >= minMember { 135 | break 136 | } 137 | podCnt++ 138 | for _, c := range task.Template.Spec.Containers { 139 | AddResourceList(minAvailableTasksRes, c.Resources.Requests, c.Resources.Limits) 140 | } 141 | } 142 | } 143 | 144 | return &minAvailableTasksRes 145 | } 146 | -------------------------------------------------------------------------------- /pkg/controller.v1/common/util_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2019 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package common 16 | 17 | import ( 18 | "testing" 19 | 20 | "github.com/stretchr/testify/assert" 21 | 22 | apiv1 "github.com/kubeflow/common/pkg/apis/common/v1" 23 | ) 24 | 25 | func TestGenGeneralName(t *testing.T) { 26 | tcs := []struct { 27 | index string 28 | key string 29 | replicaType apiv1.ReplicaType 30 | expectedName string 31 | }{ 32 | { 33 | index: "1", 34 | key: "1/2/3/4/5", 35 | replicaType: "worker", 36 | expectedName: "1-2-3-4-5-worker-1", 37 | }, 38 | { 39 | index: "1", 40 | key: "1/2/3/4/5", 41 | replicaType: "WORKER", 42 | expectedName: "1-2-3-4-5-worker-1", 43 | }, 44 | } 45 | 46 | for _, tc := range tcs { 47 | actual := GenGeneralName(tc.key, string(tc.replicaType), tc.index) 48 | if actual != tc.expectedName { 49 | t.Errorf("Expected name %s, got %s", tc.expectedName, actual) 50 | } 51 | } 52 | } 53 | 54 | func TestMaxInt(t *testing.T) { 55 | type testCase struct { 56 | x int 57 | y int 58 | expectedMax int 59 | } 60 | var testCases = []testCase{ 61 | { 62 | x: 10, 63 | y: 20, 64 | expectedMax: 20, 65 | }, 66 | { 67 | x: 20, 68 | y: 10, 69 | expectedMax: 20, 70 | }, 71 | { 72 | x: 5, 73 | y: 5, 74 | expectedMax: 5, 75 | }, 76 | } 77 | 78 | for _, tc := range testCases { 79 | result := MaxInt(tc.x, tc.y) 80 | assert.Equal(t, tc.expectedMax, result) 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /pkg/controller.v1/control/pod_control_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2019 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package control 16 | 17 | import ( 18 | "encoding/json" 19 | "net/http/httptest" 20 | "testing" 21 | 22 | "github.com/stretchr/testify/assert" 23 | corev1 "k8s.io/api/core/v1" 24 | apiequality "k8s.io/apimachinery/pkg/api/equality" 25 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 | "k8s.io/apimachinery/pkg/runtime" 27 | clientset "k8s.io/client-go/kubernetes" 28 | clientscheme "k8s.io/client-go/kubernetes/scheme" 29 | restclient "k8s.io/client-go/rest" 30 | "k8s.io/client-go/tools/record" 31 | utiltesting "k8s.io/client-go/util/testing" 32 | 33 | testutilv1 "github.com/kubeflow/common/test_job/test_util/v1" 34 | ) 35 | 36 | func TestCreatePods(t *testing.T) { 37 | ns := metav1.NamespaceDefault 38 | body := runtime.EncodeOrDie( 39 | clientscheme.Codecs.LegacyCodec(corev1.SchemeGroupVersion), 40 | &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "empty_pod"}}) 41 | fakeHandler := utiltesting.FakeHandler{ 42 | StatusCode: 200, 43 | ResponseBody: body, 44 | } 45 | testServer := httptest.NewServer(&fakeHandler) 46 | defer testServer.Close() 47 | k8sClient := clientset.NewForConfigOrDie(&restclient.Config{Host: testServer.URL, ContentConfig: restclient.ContentConfig{GroupVersion: &corev1.SchemeGroupVersion}}) 48 | 49 | podControl := RealPodControl{ 50 | KubeClient: k8sClient, 51 | Recorder: &record.FakeRecorder{}, 52 | } 53 | 54 | testJob := testutilv1.NewTestJob(1) 55 | 56 | testName := "pod-name" 57 | podTemplate := testutilv1.NewTestReplicaSpecTemplate() 58 | podTemplate.Name = testName 59 | 
podTemplate.Labels = testutilv1.GenLabels(testJob.Name) 60 | podTemplate.SetOwnerReferences([]metav1.OwnerReference{}) 61 | 62 | // Make sure createReplica sends a POST to the apiserver with a pod from the controllers pod template 63 | err := podControl.CreatePods(ns, &podTemplate, testJob) 64 | assert.NoError(t, err, "unexpected error: %v", err) 65 | 66 | expectedPod := corev1.Pod{ 67 | ObjectMeta: metav1.ObjectMeta{ 68 | Labels: testutilv1.GenLabels(testJob.Name), 69 | Name: testName, 70 | }, 71 | Spec: podTemplate.Spec, 72 | } 73 | fakeHandler.ValidateRequest(t, 74 | "/api/v1/namespaces/default/pods", "POST", nil) 75 | var actualPod = &corev1.Pod{} 76 | err = json.Unmarshal([]byte(fakeHandler.RequestBody), actualPod) 77 | assert.NoError(t, err, "unexpected error: %v", err) 78 | assert.True(t, apiequality.Semantic.DeepDerivative(&expectedPod, actualPod), 79 | "Body: %s", fakeHandler.RequestBody) 80 | } 81 | -------------------------------------------------------------------------------- /pkg/controller.v1/control/service_control_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2019 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package control 16 | 17 | import ( 18 | "encoding/json" 19 | "net/http/httptest" 20 | "testing" 21 | 22 | "github.com/stretchr/testify/assert" 23 | v1 "k8s.io/api/core/v1" 24 | apiequality "k8s.io/apimachinery/pkg/api/equality" 25 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 | "k8s.io/apimachinery/pkg/runtime" 27 | clientset "k8s.io/client-go/kubernetes" 28 | clientscheme "k8s.io/client-go/kubernetes/scheme" 29 | restclient "k8s.io/client-go/rest" 30 | "k8s.io/client-go/tools/record" 31 | utiltesting "k8s.io/client-go/util/testing" 32 | 33 | testutilv1 "github.com/kubeflow/common/test_job/test_util/v1" 34 | ) 35 | 36 | func TestCreateService(t *testing.T) { 37 | ns := metav1.NamespaceDefault 38 | body := runtime.EncodeOrDie( 39 | clientscheme.Codecs.LegacyCodec(v1.SchemeGroupVersion), 40 | &v1.Service{ObjectMeta: metav1.ObjectMeta{Name: "empty_service"}}) 41 | fakeHandler := utiltesting.FakeHandler{ 42 | StatusCode: 200, 43 | ResponseBody: body, 44 | } 45 | testServer := httptest.NewServer(&fakeHandler) 46 | defer testServer.Close() 47 | clientset := clientset.NewForConfigOrDie(&restclient.Config{ 48 | Host: testServer.URL, 49 | ContentConfig: restclient.ContentConfig{ 50 | GroupVersion: &v1.SchemeGroupVersion, 51 | }, 52 | }) 53 | 54 | serviceControl := RealServiceControl{ 55 | KubeClient: clientset, 56 | Recorder: &record.FakeRecorder{}, 57 | } 58 | 59 | testJob := testutilv1.NewTestJob(1) 60 | 61 | testName := "service-name" 62 | service := testutilv1.NewBaseService(testName, testJob, t) 63 | service.SetOwnerReferences([]metav1.OwnerReference{}) 64 | 65 | // Make sure createReplica sends a POST to the apiserver with a pod from the controllers pod template 66 | err := serviceControl.CreateServices(ns, service, testJob) 67 | assert.NoError(t, err, "unexpected error: %v", err) 68 | 69 | expectedService := v1.Service{ 70 | ObjectMeta: metav1.ObjectMeta{ 71 | Labels: testutilv1.GenLabels(testJob.Name), 72 | Name: testName, 73 | Namespace: ns, 74 | }, 
75 | } 76 | fakeHandler.ValidateRequest(t, 77 | "/api/v1/namespaces/default/services", "POST", nil) 78 | var actualService = &v1.Service{} 79 | err = json.Unmarshal([]byte(fakeHandler.RequestBody), actualService) 80 | assert.NoError(t, err, "unexpected error: %v", err) 81 | assert.True(t, apiequality.Semantic.DeepDerivative(&expectedService, actualService), 82 | "Body: %s", fakeHandler.RequestBody) 83 | } 84 | 85 | func TestCreateServicesWithControllerRef(t *testing.T) { 86 | ns := metav1.NamespaceDefault 87 | body := runtime.EncodeOrDie( 88 | clientscheme.Codecs.LegacyCodec(v1.SchemeGroupVersion), 89 | &v1.Service{ObjectMeta: metav1.ObjectMeta{Name: "empty_service"}}) 90 | fakeHandler := utiltesting.FakeHandler{ 91 | StatusCode: 200, 92 | ResponseBody: body, 93 | } 94 | testServer := httptest.NewServer(&fakeHandler) 95 | defer testServer.Close() 96 | clientset := clientset.NewForConfigOrDie(&restclient.Config{ 97 | Host: testServer.URL, 98 | ContentConfig: restclient.ContentConfig{ 99 | GroupVersion: &v1.SchemeGroupVersion, 100 | }, 101 | }) 102 | 103 | serviceControl := RealServiceControl{ 104 | KubeClient: clientset, 105 | Recorder: &record.FakeRecorder{}, 106 | } 107 | 108 | testJob := testutilv1.NewTestJob(1) 109 | 110 | testName := "service-name" 111 | service := testutilv1.NewBaseService(testName, testJob, t) 112 | service.SetOwnerReferences([]metav1.OwnerReference{}) 113 | 114 | ownerRef := testutilv1.GenOwnerReference(testJob) 115 | 116 | // Make sure createReplica sends a POST to the apiserver with a pod from the controllers pod template 117 | err := serviceControl.CreateServicesWithControllerRef(ns, service, testJob, ownerRef) 118 | assert.NoError(t, err, "unexpected error: %v", err) 119 | 120 | expectedService := v1.Service{ 121 | ObjectMeta: metav1.ObjectMeta{ 122 | Labels: testutilv1.GenLabels(testJob.Name), 123 | Name: testName, 124 | Namespace: ns, 125 | OwnerReferences: []metav1.OwnerReference{*ownerRef}, 126 | }, 127 | } 128 | 
fakeHandler.ValidateRequest(t, 129 | "/api/v1/namespaces/default/services", "POST", nil) 130 | var actualService = &v1.Service{} 131 | err = json.Unmarshal([]byte(fakeHandler.RequestBody), actualService) 132 | assert.NoError(t, err, "unexpected error: %v", err) 133 | assert.True(t, apiequality.Semantic.DeepDerivative(&expectedService, actualService), 134 | "Body: %s", fakeHandler.RequestBody) 135 | } 136 | -------------------------------------------------------------------------------- /pkg/controller.v1/control/utils.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023 The Kubeflow Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package control 18 | 19 | import ( 20 | "fmt" 21 | v1 "k8s.io/api/core/v1" 22 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 23 | "k8s.io/apimachinery/pkg/runtime" 24 | ) 25 | 26 | func ValidateControllerRef(controllerRef *metav1.OwnerReference) error { 27 | if controllerRef == nil { 28 | return fmt.Errorf("controllerRef is nil") 29 | } 30 | if len(controllerRef.APIVersion) == 0 { 31 | return fmt.Errorf("controllerRef has empty APIVersion") 32 | } 33 | if len(controllerRef.Kind) == 0 { 34 | return fmt.Errorf("controllerRef has empty Kind") 35 | } 36 | if controllerRef.Controller == nil || !*controllerRef.Controller { 37 | return fmt.Errorf("controllerRef.Controller is not set to true") 38 | } 39 | if controllerRef.BlockOwnerDeletion == nil || !*controllerRef.BlockOwnerDeletion { 40 | return fmt.Errorf("controllerRef.BlockOwnerDeletion is not set") 41 | } 42 | return nil 43 | } 44 | 45 | func GetServiceFromTemplate(template *v1.Service, parentObject runtime.Object, controllerRef *metav1.OwnerReference) (*v1.Service, error) { 46 | service := template.DeepCopy() 47 | if controllerRef != nil { 48 | service.OwnerReferences = append(service.OwnerReferences, *controllerRef) 49 | } 50 | return service, nil 51 | } 52 | -------------------------------------------------------------------------------- /pkg/controller.v1/expectation/expectation_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023 The Kubeflow Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package expectation 18 | 19 | import ( 20 | "sync" 21 | "testing" 22 | "time" 23 | 24 | "github.com/stretchr/testify/assert" 25 | v1 "k8s.io/api/core/v1" 26 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 27 | "k8s.io/apimachinery/pkg/util/uuid" 28 | "k8s.io/client-go/tools/cache" 29 | clock "k8s.io/utils/clock/testing" 30 | ) 31 | 32 | var ( 33 | // KeyFunc is the short name to DeletionHandlingMetaNamespaceKeyFunc. 34 | // IndexerInformer uses a delta queue, therefore for deletes we have to use this 35 | // key function but it should be just fine for non delete events. 36 | KeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc 37 | ) 38 | 39 | // NewFakeControllerExpectationsLookup creates a fake store for PodExpectations. 40 | func NewFakeControllerExpectationsLookup(ttl time.Duration) (*ControllerExpectations, *clock.FakeClock) { 41 | fakeTime := time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC) 42 | fakeClock := clock.NewFakeClock(fakeTime) 43 | ttlPolicy := &cache.TTLPolicy{TTL: ttl, Clock: fakeClock} 44 | ttlStore := cache.NewFakeExpirationStore( 45 | ExpKeyFunc, nil, ttlPolicy, fakeClock) 46 | return &ControllerExpectations{ttlStore}, fakeClock 47 | } 48 | 49 | func newReplicationController(replicas int) *v1.ReplicationController { 50 | rc := &v1.ReplicationController{ 51 | TypeMeta: metav1.TypeMeta{APIVersion: "v1"}, 52 | ObjectMeta: metav1.ObjectMeta{ 53 | UID: uuid.NewUUID(), 54 | Name: "foobar", 55 | Namespace: metav1.NamespaceDefault, 56 | ResourceVersion: "18", 57 | }, 58 | Spec: v1.ReplicationControllerSpec{ 59 | Replicas: func() *int32 { i := int32(replicas); return &i }(), 60 | Selector: map[string]string{"foo": "bar"}, 61 | Template: &v1.PodTemplateSpec{ 62 | ObjectMeta: metav1.ObjectMeta{ 63 | Labels: map[string]string{ 64 | "name": "foo", 65 | "type": "production", 66 | }, 67 | }, 68 | Spec: v1.PodSpec{ 69 | 
Containers: []v1.Container{ 70 | { 71 | Image: "foo/bar", 72 | TerminationMessagePath: v1.TerminationMessagePathDefault, 73 | ImagePullPolicy: v1.PullIfNotPresent, 74 | }, 75 | }, 76 | RestartPolicy: v1.RestartPolicyAlways, 77 | DNSPolicy: v1.DNSDefault, 78 | NodeSelector: map[string]string{ 79 | "baz": "blah", 80 | }, 81 | }, 82 | }, 83 | }, 84 | } 85 | return rc 86 | } 87 | 88 | func TestControllerExpectations(t *testing.T) { 89 | ttl := 30 * time.Second 90 | e, fakeClock := NewFakeControllerExpectationsLookup(ttl) 91 | // In practice we can't really have add and delete expectations since we only either create or 92 | // delete replicas in one rc pass, and the rc goes to sleep soon after until the expectations are 93 | // either fulfilled or timeout. 94 | adds, dels := 10, 30 95 | rc := newReplicationController(1) 96 | 97 | // RC fires off adds and deletes at apiserver, then sets expectations 98 | rcKey, err := KeyFunc(rc) 99 | assert.NoError(t, err, "Couldn't get key for object %#v: %v", rc, err) 100 | 101 | err = e.SetExpectations(rcKey, adds, dels) 102 | assert.NoError(t, err, "Could not register expectations for rc, err: %v", err) 103 | 104 | var wg sync.WaitGroup 105 | for i := 0; i < adds+1; i++ { 106 | wg.Add(1) 107 | go func() { 108 | // In prod this can happen either because of a failed create by the rc 109 | // or after having observed a create via informer 110 | e.CreationObserved(rcKey) 111 | wg.Done() 112 | }() 113 | } 114 | wg.Wait() 115 | 116 | // There are still delete expectations 117 | assert.False(t, e.SatisfiedExpectations(rcKey), "Rc will sync before expectations are met") 118 | 119 | for i := 0; i < dels+1; i++ { 120 | wg.Add(1) 121 | go func() { 122 | e.DeletionObserved(rcKey) 123 | wg.Done() 124 | }() 125 | } 126 | wg.Wait() 127 | 128 | // Expectations have been surpassed 129 | podExp, exists, err := e.GetExpectations(rcKey) 130 | assert.NoError(t, err, "Could not get expectations for rc, exists %v and err %v", exists, err) 131 | 
assert.True(t, exists, "Could not get expectations for rc, exists %v and err %v", exists, err) 132 | 133 | add, del := podExp.GetExpectations() 134 | assert.Equal(t, int64(-1), add, "Unexpected pod expectations %#v", podExp) 135 | assert.Equal(t, int64(-1), del, "Unexpected pod expectations %#v", podExp) 136 | assert.True(t, e.SatisfiedExpectations(rcKey), "Expectations are met but the rc will not sync") 137 | 138 | // Next round of rc sync, old expectations are cleared 139 | err = e.SetExpectations(rcKey, 1, 2) 140 | assert.NoError(t, err, "Could not register expectations for rc, err %v", err) 141 | podExp, exists, err = e.GetExpectations(rcKey) 142 | assert.NoError(t, err, "Could not get expectations for rc, exists %v and err %v", exists, err) 143 | assert.True(t, exists, "Could not get expectations for rc, exists %v and err %v", exists, err) 144 | add, del = podExp.GetExpectations() 145 | 146 | assert.Equal(t, int64(1), add, "Unexpected pod expectations %#v", podExp) 147 | assert.Equal(t, int64(2), del, "Unexpected pod expectations %#v", podExp) 148 | 149 | // Expectations have expired because of ttl 150 | fakeClock.Step(ttl + 1) 151 | assert.True(t, e.SatisfiedExpectations(rcKey), 152 | "Expectations should have expired but didn't") 153 | } 154 | -------------------------------------------------------------------------------- /pkg/controller.v1/expectation/util.go: -------------------------------------------------------------------------------- 1 | package expectation 2 | 3 | import ( 4 | "strings" 5 | ) 6 | 7 | // GenExpectationPodsKey generates an expectation key for pods of a job 8 | func GenExpectationPodsKey(jobKey string, replicaType string) string { 9 | return jobKey + "/" + strings.ToLower(replicaType) + "/pods" 10 | } 11 | 12 | // GenExpectationServicesKey generates an expectation key for services of a job 13 | func GenExpectationServicesKey(jobKey string, replicaType string) string { 14 | return jobKey + "/" + strings.ToLower(replicaType) + "/services" 15
| } 16 | -------------------------------------------------------------------------------- /pkg/core/job.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023 The Kubeflow Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package core 18 | 19 | import ( 20 | "sort" 21 | "strings" 22 | "time" 23 | 24 | log "github.com/sirupsen/logrus" 25 | 26 | apiv1 "github.com/kubeflow/common/pkg/apis/common/v1" 27 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 | 29 | v1 "k8s.io/api/core/v1" 30 | "k8s.io/apimachinery/pkg/runtime" 31 | "k8s.io/client-go/tools/record" 32 | ) 33 | 34 | // RecordAbnormalPods records the active pod whose latest condition is not in True status. 35 | func RecordAbnormalPods(activePods []*v1.Pod, object runtime.Object, recorder record.EventRecorder) { 36 | for _, pod := range activePods { 37 | // If the pod starts running, should checks the container statuses rather than the conditions. 
38 | recordContainerStatus := func(status *v1.ContainerStatus) { 39 | if status.State.Terminated != nil && status.State.Terminated.ExitCode != 0 { 40 | terminated := status.State.Terminated 41 | recorder.Eventf(object, v1.EventTypeWarning, terminated.Reason, 42 | "Error pod %s container %s exitCode: %d terminated message: %s", 43 | pod.Name, status.Name, terminated.ExitCode, terminated.Message) 44 | } 45 | // The terminated state and waiting state don't simultaneously exists, checks them at the same time. 46 | if status.State.Waiting != nil && status.State.Waiting.Message != "" { 47 | wait := status.State.Waiting 48 | recorder.Eventf(object, v1.EventTypeWarning, wait.Reason, 49 | "Error pod %s container %s waiting message: %s", pod.Name, status.Name, wait.Message) 50 | } 51 | } 52 | if len(pod.Status.ContainerStatuses) != 0 { 53 | for _, status := range pod.Status.ContainerStatuses { 54 | recordContainerStatus(&status) 55 | } 56 | // If the pod has container status info, that means the init container statuses are normal. 57 | continue 58 | } 59 | if len(pod.Status.InitContainerStatuses) != 0 { 60 | for _, status := range pod.Status.InitContainerStatuses { 61 | recordContainerStatus(&status) 62 | } 63 | continue 64 | } 65 | if len(pod.Status.Conditions) == 0 { 66 | continue 67 | } 68 | // Should not modify the original pod which is stored in the informer cache. 69 | status := pod.Status.DeepCopy() 70 | sort.Slice(status.Conditions, func(i, j int) bool { 71 | return status.Conditions[i].LastTransitionTime.After(status.Conditions[j].LastTransitionTime.Time) 72 | }) 73 | condition := status.Conditions[0] 74 | if condition.Status == v1.ConditionTrue { 75 | continue 76 | } 77 | recorder.Eventf(object, v1.EventTypeWarning, condition.Reason, "Error pod %s condition message: %s", pod.Name, condition.Message) 78 | } 79 | } 80 | 81 | // PastActiveDeadline checks if job has ActiveDeadlineSeconds field set and if it is exceeded. 
82 | func PastActiveDeadline(runPolicy *apiv1.RunPolicy, jobStatus apiv1.JobStatus) bool { 83 | if runPolicy.ActiveDeadlineSeconds == nil || jobStatus.StartTime == nil { 84 | return false 85 | } 86 | now := metav1.Now() 87 | start := jobStatus.StartTime.Time 88 | duration := now.Time.Sub(start) 89 | allowedDuration := time.Duration(*runPolicy.ActiveDeadlineSeconds) * time.Second 90 | return duration >= allowedDuration 91 | } 92 | 93 | // PastBackoffLimit checks if container restartCounts sum exceeds BackoffLimit 94 | // this method applies only to pods when restartPolicy is one of OnFailure, Always or ExitCode 95 | func PastBackoffLimit(jobName string, runPolicy *apiv1.RunPolicy, 96 | replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec, pods []*v1.Pod, 97 | podFilterFunc func(pods []*v1.Pod, replicaType string) ([]*v1.Pod, error)) (bool, error) { 98 | if runPolicy.BackoffLimit == nil { 99 | return false, nil 100 | } 101 | result := int32(0) 102 | for rtype, spec := range replicas { 103 | if spec.RestartPolicy != apiv1.RestartPolicyOnFailure && spec.RestartPolicy != apiv1.RestartPolicyAlways && spec.RestartPolicy != apiv1.RestartPolicyExitCode { 104 | log.Warnf("The restart policy of replica %v of the job %v is not OnFailure, Always or ExitCode. Not counted in backoff limit.", rtype, jobName) 105 | continue 106 | } 107 | // Convert ReplicaType to lower string. 
108 | rt := strings.ToLower(string(rtype)) 109 | pods, err := podFilterFunc(pods, rt) 110 | if err != nil { 111 | return false, err 112 | } 113 | for i := range pods { 114 | po := pods[i] 115 | if po.Status.Phase != v1.PodRunning { 116 | continue 117 | } 118 | for j := range po.Status.InitContainerStatuses { 119 | stat := po.Status.InitContainerStatuses[j] 120 | result += stat.RestartCount 121 | } 122 | for j := range po.Status.ContainerStatuses { 123 | stat := po.Status.ContainerStatuses[j] 124 | result += stat.RestartCount 125 | } 126 | } 127 | } 128 | 129 | if *runPolicy.BackoffLimit == 0 { 130 | return result > 0, nil 131 | } 132 | return result >= *runPolicy.BackoffLimit, nil 133 | } 134 | -------------------------------------------------------------------------------- /pkg/core/pod.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023 The Kubeflow Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package core 18 | 19 | import ( 20 | utillabels "github.com/kubeflow/common/pkg/util/labels" 21 | 22 | apiv1 "github.com/kubeflow/common/pkg/apis/common/v1" 23 | log "github.com/sirupsen/logrus" 24 | v1 "k8s.io/api/core/v1" 25 | "k8s.io/apimachinery/pkg/labels" 26 | ) 27 | 28 | // FilterPodsForReplicaType returns pods belong to a replicaType. 
29 | func FilterPodsForReplicaType(pods []*v1.Pod, replicaType string) ([]*v1.Pod, error) { 30 | var result []*v1.Pod 31 | 32 | selector := labels.SelectorFromValidatedSet(labels.Set{ 33 | apiv1.ReplicaTypeLabel: replicaType, 34 | }) 35 | 36 | for _, pod := range pods { 37 | set := labels.Set(pod.Labels) 38 | if !selector.Matches(set) { 39 | continue 40 | } 41 | result = append(result, pod) 42 | } 43 | return result, nil 44 | } 45 | 46 | // GetPodSlices returns a slice whose element i is the slice of pods carrying replica index i. 47 | // It gives the caller enough information to decide whether to scale resources up or down; pods whose index cannot be obtained or is negative are logged and skipped. 48 | func GetPodSlices(pods []*v1.Pod, replicas int, logger *log.Entry) [][]*v1.Pod { 49 | podSlices := make([][]*v1.Pod, CalculatePodSliceSize(pods, replicas)) 50 | for _, pod := range pods { 51 | index, err := utillabels.ReplicaIndex(pod.Labels) 52 | if err != nil { 53 | logger.Warningf("Error obtaining replica index from Pod %s/%s: %v", pod.Namespace, pod.Name, err) 54 | continue 55 | } 56 | if index < 0 || index >= replicas { 57 | logger.Warningf("The label index is not expected: %d, pod: %s/%s", index, pod.Namespace, pod.Name) 58 | } 59 | if index >= 0 { // a negative index cannot address podSlices (indexing would panic), so such pods are only logged 60 | podSlices[index] = append(podSlices[index], pod) 61 | } 62 | } 63 | return podSlices 64 | } 65 | 66 | // CalculatePodSliceSize compares the max pod index (plus one) with the desired replicas and returns the larger size. 67 | func CalculatePodSliceSize(pods []*v1.Pod, replicas int) int { 68 | size := 0 69 | for _, pod := range pods { 70 | index, err := utillabels.ReplicaIndex(pod.Labels) 71 | if err != nil { 72 | continue 73 | } 74 | size = MaxInt(size, index) 75 | } 76 | // size comes from index, need to +1 to indicate real size 77 | return MaxInt(size+1, replicas) 78 | } 79 | 80 | // SetRestartPolicy check the RestartPolicy defined in job spec and overwrite RestartPolicy in podTemplate if necessary 81 | func SetRestartPolicy(podTemplateSpec *v1.PodTemplateSpec, spec *apiv1.ReplicaSpec) { 82 | // This is necessary since restartPolicyExitCode is not supported 
in v1.PodTemplateSpec 83 | if spec.RestartPolicy == apiv1.RestartPolicyExitCode { 84 | podTemplateSpec.Spec.RestartPolicy = v1.RestartPolicyNever 85 | } else { 86 | podTemplateSpec.Spec.RestartPolicy = v1.RestartPolicy(spec.RestartPolicy) 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /pkg/core/service.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023 The Kubeflow Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package core 18 | 19 | import ( 20 | "fmt" 21 | 22 | apiv1 "github.com/kubeflow/common/pkg/apis/common/v1" 23 | utillabels "github.com/kubeflow/common/pkg/util/labels" 24 | log "github.com/sirupsen/logrus" 25 | v1 "k8s.io/api/core/v1" 26 | "k8s.io/apimachinery/pkg/labels" 27 | ) 28 | 29 | // FilterServicesForReplicaType returns service belong to a replicaType. 30 | func FilterServicesForReplicaType(services []*v1.Service, replicaType string) ([]*v1.Service, error) { 31 | var result []*v1.Service 32 | 33 | selector := labels.SelectorFromValidatedSet(labels.Set{ 34 | apiv1.ReplicaTypeLabel: replicaType, 35 | }) 36 | 37 | for _, service := range services { 38 | set := labels.Set(service.Labels) 39 | if !selector.Matches(set) { 40 | continue 41 | } 42 | result = append(result, service) 43 | } 44 | return result, nil 45 | } 46 | 47 | // GetServiceSlices returns a slice, which element is the slice of service. 
48 | // Assume the return object is serviceSlices, then serviceSlices[i] is an 49 | // array of pointers to services corresponding to Services for replica i. 50 | func GetServiceSlices(services []*v1.Service, replicas int, logger *log.Entry) [][]*v1.Service { 51 | serviceSlices := make([][]*v1.Service, CalculateServiceSliceSize(services, replicas)) 52 | for _, service := range services { 53 | index, err := utillabels.ReplicaIndex(service.Labels) 54 | if err != nil { 55 | logger.Warningf("Error obtaining index for service %s/%s: %v", service.Namespace, service.Name, err) 56 | continue 57 | } 58 | if index < 0 || index >= replicas { 59 | logger.Warningf("The label index is not expected: %d, service: %s/%s", index, service.Namespace, service.Name) 60 | } 61 | 62 | serviceSlices[index] = append(serviceSlices[index], service) 63 | } 64 | return serviceSlices 65 | } 66 | 67 | // CalculateServiceSliceSize compare max pod index with desired replicas and return larger size 68 | func CalculateServiceSliceSize(services []*v1.Service, replicas int) int { 69 | size := 0 70 | for _, svc := range services { 71 | index, err := utillabels.ReplicaIndex(svc.Labels) 72 | if err != nil { 73 | continue 74 | } 75 | size = MaxInt(size, index) 76 | } 77 | 78 | // size comes from index, need to +1 to indicate real size 79 | return MaxInt(size+1, replicas) 80 | } 81 | 82 | // GetPortsFromJob gets the ports of job container. Port could be nil, if distributed communication strategy doesn't need and no other ports that need to be exposed. 
83 | func GetPortsFromJob(spec *apiv1.ReplicaSpec, defaultContainerName string) (map[string]int32, error) { 84 | ports := make(map[string]int32) 85 | 86 | containers := spec.Template.Spec.Containers 87 | for _, container := range containers { 88 | if container.Name == defaultContainerName { 89 | containerPorts := container.Ports 90 | if len(containerPorts) == 0 { 91 | return nil, nil 92 | } 93 | for _, port := range containerPorts { 94 | ports[port.Name] = port.ContainerPort 95 | } 96 | return ports, nil 97 | } 98 | } 99 | 100 | return nil, fmt.Errorf("failed to find the port") 101 | } 102 | -------------------------------------------------------------------------------- /pkg/core/status.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023 The Kubeflow Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package core 18 | 19 | import ( 20 | apiv1 "github.com/kubeflow/common/pkg/apis/common/v1" 21 | corev1 "k8s.io/api/core/v1" 22 | ) 23 | 24 | // InitializeReplicaStatuses initializes the ReplicaStatuses for replica. 
25 | func InitializeReplicaStatuses(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType) { 26 | if jobStatus.ReplicaStatuses == nil { 27 | jobStatus.ReplicaStatuses = make(map[apiv1.ReplicaType]*apiv1.ReplicaStatus) 28 | } 29 | 30 | jobStatus.ReplicaStatuses[rtype] = &apiv1.ReplicaStatus{} 31 | } 32 | 33 | // UpdateJobReplicaStatuses updates the JobReplicaStatuses according to the pod. 34 | func UpdateJobReplicaStatuses(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, pod *corev1.Pod) { 35 | switch pod.Status.Phase { 36 | case corev1.PodRunning: 37 | if pod.DeletionTimestamp != nil { 38 | // when node is not ready, the pod will be in terminating state. 39 | // Count deleted Pods as failures to account for orphan Pods that 40 | // never have a chance to reach the Failed phase. 41 | jobStatus.ReplicaStatuses[rtype].Failed++ 42 | } else { 43 | jobStatus.ReplicaStatuses[rtype].Active++ 44 | } 45 | case corev1.PodSucceeded: 46 | jobStatus.ReplicaStatuses[rtype].Succeeded++ 47 | case corev1.PodFailed: 48 | jobStatus.ReplicaStatuses[rtype].Failed++ 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /pkg/core/utils.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023 The Kubeflow Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
// MaxInt returns the larger of x and y.
func MaxInt(x, y int) int {
	if y > x {
		return y
	}
	return x
}

// GenGeneralName builds "<jobName>-<lowercased rtype>-<index>" and replaces
// every "/" in the result with "-".
func GenGeneralName(jobName string, rtype string, index string) string {
	parts := []string{jobName, strings.ToLower(rtype), index}
	return strings.ReplaceAll(strings.Join(parts, "-"), "/", "-")
}
25 | -------------------------------------------------------------------------------- /pkg/reconciler.v1/common/gang.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package common 16 | 17 | import ( 18 | "sigs.k8s.io/controller-runtime/pkg/client" 19 | ) 20 | 21 | // BaseGangReconciler defines a basic gang reconciler 22 | type BaseGangReconciler struct { 23 | Enabled bool 24 | } 25 | 26 | // GangSchedulingEnabled returns if gang-scheduling is enabled for all jobs 27 | func (r *BaseGangReconciler) GangSchedulingEnabled() bool { 28 | return r.Enabled 29 | } 30 | 31 | // GetPodGroupName returns the name of PodGroup for this job 32 | func (r *BaseGangReconciler) GetPodGroupName(job client.Object) string { 33 | return job.GetName() 34 | } 35 | -------------------------------------------------------------------------------- /pkg/reconciler.v1/common/pod_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package common_test 16 | 17 | import ( 18 | "testing" 19 | 20 | commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" 21 | testjobv1 "github.com/kubeflow/common/test_job/apis/test_job/v1" 22 | "github.com/kubeflow/common/test_job/reconciler.v1/test_job" 23 | testutilv1 "github.com/kubeflow/common/test_job/test_util/v1" 24 | 25 | corev1 "k8s.io/api/core/v1" 26 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 27 | ) 28 | 29 | func TestGenPodName(t *testing.T) { 30 | type tc struct { 31 | testJob *testjobv1.TestJob 32 | testRType string 33 | testIndex string 34 | expectedName string 35 | } 36 | testCase := []tc{ 37 | func() tc { 38 | tj := testutilv1.NewTestJob(1) 39 | tj.SetName("hello-world") 40 | return tc{ 41 | testJob: tj, 42 | testRType: string(testjobv1.TestReplicaTypeWorker), 43 | testIndex: "1", 44 | expectedName: "hello-world-worker-1", 45 | } 46 | }(), 47 | } 48 | 49 | testReconciler := test_job.NewTestReconciler() 50 | 51 | for _, c := range testCase { 52 | na := testReconciler.GenPodName(c.testJob.GetName(), c.testRType, c.testIndex) 53 | if na != c.expectedName { 54 | t.Errorf("Expected %s, got %s", c.expectedName, na) 55 | } 56 | } 57 | } 58 | 59 | func PodInSlice(pod *corev1.Pod, pods []*corev1.Pod) bool { 60 | for _, p := range pods { 61 | if p.GetNamespace() == pod.GetNamespace() && p.GetName() == pod.GetName() { 62 | return true 63 | } 64 | } 65 | return false 66 | } 67 | 68 | func TestFilterPodsForReplicaType(t *testing.T) { 69 | type tc struct { 70 | testPods []*corev1.Pod 71 | testRType 
string 72 | expectedPods []*corev1.Pod 73 | } 74 | testCase := []tc{ 75 | func() tc { 76 | tj := testutilv1.NewTestJob(3) 77 | tj.SetName("hello-world") 78 | 79 | pod0 := &corev1.Pod{ 80 | ObjectMeta: metav1.ObjectMeta{ 81 | Name: "pod0", 82 | Namespace: "default", 83 | Labels: map[string]string{ 84 | commonv1.ReplicaTypeLabel: string(testjobv1.TestReplicaTypeMaster), 85 | }, 86 | }, 87 | Spec: corev1.PodSpec{}, 88 | Status: corev1.PodStatus{}, 89 | } 90 | 91 | pod1 := &corev1.Pod{ 92 | ObjectMeta: metav1.ObjectMeta{ 93 | Name: "pod1", 94 | Namespace: "default", 95 | Labels: map[string]string{ 96 | commonv1.ReplicaTypeLabel: string(testjobv1.TestReplicaTypeWorker), 97 | }, 98 | }, 99 | Spec: corev1.PodSpec{}, 100 | Status: corev1.PodStatus{}, 101 | } 102 | 103 | pod2 := &corev1.Pod{ 104 | ObjectMeta: metav1.ObjectMeta{ 105 | Name: "pod2", 106 | Namespace: "default", 107 | Labels: map[string]string{ 108 | commonv1.ReplicaTypeLabel: string(testjobv1.TestReplicaTypeWorker), 109 | }, 110 | }, 111 | Spec: corev1.PodSpec{}, 112 | Status: corev1.PodStatus{}, 113 | } 114 | 115 | allPods := []*corev1.Pod{pod0, pod1, pod2} 116 | filteredPods := []*corev1.Pod{pod1, pod2} 117 | 118 | return tc{ 119 | testPods: allPods, 120 | testRType: string(testjobv1.TestReplicaTypeWorker), 121 | expectedPods: filteredPods, 122 | } 123 | }(), 124 | } 125 | 126 | testReconciler := test_job.NewTestReconciler() 127 | 128 | for _, c := range testCase { 129 | filtered, err := testReconciler.FilterPodsForReplicaType(c.testPods, c.testRType) 130 | if err != nil { 131 | t.Errorf("FilterPodsForReplicaType returns error %v", err) 132 | } 133 | for _, ep := range c.expectedPods { 134 | if !PodInSlice(ep, filtered) { 135 | t.Errorf("Cannot found expected pod %s", ep.GetName()) 136 | } 137 | } 138 | 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /pkg/reconciler.v1/common/service_test.go: 
-------------------------------------------------------------------------------- 1 | // Copyright 2021 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package common_test 16 | 17 | import ( 18 | "reflect" 19 | "strings" 20 | "testing" 21 | 22 | commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" 23 | testjobv1 "github.com/kubeflow/common/test_job/apis/test_job/v1" 24 | "github.com/kubeflow/common/test_job/reconciler.v1/test_job" 25 | test_utilv1 "github.com/kubeflow/common/test_job/test_util/v1" 26 | 27 | corev1 "k8s.io/api/core/v1" 28 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 | ) 30 | 31 | func TestCreateNewService(t *testing.T) { 32 | type tc struct { 33 | testJob *testjobv1.TestJob 34 | testRType commonv1.ReplicaType 35 | testSpec *commonv1.ReplicaSpec 36 | testIndex string 37 | expectedService *corev1.Service 38 | } 39 | testCase := []tc{ 40 | func() tc { 41 | tj := test_utilv1.NewTestJob(3) 42 | jobName := "testjob1" 43 | tj.SetName(jobName) 44 | idx := "0" 45 | svc := &corev1.Service{ 46 | ObjectMeta: metav1.ObjectMeta{ 47 | Name: jobName + "-worker-" + idx, 48 | Namespace: corev1.NamespaceDefault, 49 | }, 50 | Spec: corev1.ServiceSpec{ 51 | Ports: []corev1.ServicePort{ 52 | corev1.ServicePort{ 53 | Name: testjobv1.DefaultPortName, 54 | Port: testjobv1.DefaultPort, 55 | }, 56 | }, 57 | ClusterIP: corev1.ClusterIPNone, 58 | Selector: map[string]string{ 59 | 
commonv1.OperatorNameLabel: "Test Reconciler", 60 | commonv1.JobNameLabel: jobName, 61 | commonv1.ReplicaTypeLabel: strings.ToLower(string(testjobv1.TestReplicaTypeWorker)), 62 | commonv1.ReplicaIndexLabel: idx, 63 | }, 64 | }, 65 | } 66 | return tc{ 67 | testJob: tj, 68 | testRType: commonv1.ReplicaType(testjobv1.TestReplicaTypeWorker), 69 | testSpec: tj.Spec.TestReplicaSpecs[testjobv1.TestReplicaTypeWorker], 70 | testIndex: idx, 71 | expectedService: svc, 72 | } 73 | }(), 74 | } 75 | testReconciler := test_job.NewTestReconciler() 76 | 77 | for _, c := range testCase { 78 | err := testReconciler.CreateNewService(c.testJob, c.testRType, c.testSpec, c.testIndex) 79 | if err != nil { 80 | t.Errorf("Got error when CreateNewService: %v", err) 81 | continue 82 | } 83 | 84 | found := false 85 | for _, obj := range testReconciler.DC.Cache { 86 | if obj.GetName() == c.expectedService.GetName() && obj.GetNamespace() == c.expectedService.GetNamespace() { 87 | found = true 88 | svcCreated := obj.(*corev1.Service) 89 | svcExpected := c.expectedService 90 | if !reflect.DeepEqual(svcExpected.Spec, svcCreated.Spec) { 91 | t.Errorf("Spec mismatch for service %s/%s", svcExpected.GetNamespace(), svcExpected.GetName()) 92 | } 93 | } 94 | } 95 | 96 | if !found { 97 | t.Errorf("Cannot find Service %s/%s created", c.expectedService.GetNamespace(), c.expectedService.GetName()) 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /pkg/reconciler.v1/common/utils.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package common 16 | 17 | import ( 18 | "github.com/go-logr/logr" 19 | "k8s.io/apimachinery/pkg/runtime" 20 | "k8s.io/apimachinery/pkg/types" 21 | "k8s.io/client-go/tools/record" 22 | "sigs.k8s.io/controller-runtime/pkg/client" 23 | ) 24 | 25 | const ReconcilerName = "common-reconciler" 26 | 27 | // GetReconcilerName returns the name of this reconciler, which is "common-reconciler" 28 | func (r *ReconcilerUtil) GetReconcilerName() string { 29 | return ReconcilerName 30 | } 31 | 32 | // ReconcilerUtil defines a reconciler with utility features 33 | type ReconcilerUtil struct { 34 | Recorder record.EventRecorder 35 | Log logr.Logger 36 | Scheme *runtime.Scheme 37 | } 38 | 39 | // BareUtilReconciler returns a pointer of ReconcilerUtil with default implementation 40 | func BareUtilReconciler( 41 | recorder record.EventRecorder, 42 | log logr.Logger, 43 | scheme *runtime.Scheme) *ReconcilerUtil { 44 | return &ReconcilerUtil{ 45 | Recorder: recorder, 46 | Log: log, 47 | Scheme: scheme, 48 | } 49 | } 50 | 51 | // GetRecorder returns a record.EventRecorder 52 | func (r *ReconcilerUtil) GetRecorder() record.EventRecorder { 53 | return r.Recorder 54 | } 55 | 56 | // GetLogger returns a logr.Logger 57 | func (r *ReconcilerUtil) GetLogger(job client.Object) logr.Logger { 58 | return r.Log.WithValues( 59 | job.GetObjectKind().GroupVersionKind().Kind, 60 | types.NamespacedName{Name: job.GetName(), Namespace: job.GetNamespace()}.String()) 61 | } 62 | 63 | // GetScheme returns the pointer of runtime.Schemes that is used in 
this reconciler 64 | func (r *ReconcilerUtil) GetScheme() *runtime.Scheme { 65 | return r.Scheme 66 | } 67 | -------------------------------------------------------------------------------- /pkg/reconciler.v1/common/utils_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package common_test 16 | 17 | import ( 18 | "testing" 19 | 20 | commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" 21 | 22 | "github.com/kubeflow/common/test_job/reconciler.v1/test_job" 23 | ) 24 | 25 | func TestGenLabels(t *testing.T) { 26 | type tc struct { 27 | testJobName string 28 | expectedLabel map[string]string 29 | } 30 | testCase := []tc{ 31 | func() tc { 32 | return tc{ 33 | testJobName: "test/job1", 34 | expectedLabel: map[string]string{ 35 | commonv1.JobNameLabel: "test-job1", 36 | commonv1.OperatorNameLabel: "Test Reconciler", 37 | }, 38 | } 39 | }(), 40 | } 41 | 42 | testReconciler := test_job.NewTestReconciler() 43 | 44 | for _, c := range testCase { 45 | labels := testReconciler.GenLabels(c.testJobName) 46 | if len(labels) != len(c.expectedLabel) { 47 | t.Errorf("Expected to get %d labels, got %d labels", len(c.expectedLabel), len(labels)) 48 | continue 49 | } 50 | for ek, ev := range c.expectedLabel { 51 | if v, ok := labels[ek]; !ok { 52 | t.Errorf("Cannot found expected key %s", ek) 53 | 
// Counter is a set of named integer counters guarded by a mutex.
type Counter struct {
	lock sync.Mutex
	data map[string]int
}

// NewCounter returns an empty Counter ready for use.
func NewCounter() *Counter {
	return &Counter{
		lock: sync.Mutex{},
		data: map[string]int{},
	}
}

// Inc adds one to the counter stored under key. The first Inc for a key only
// registers it with the value 0.
func (c *Counter) Inc(key string) {
	c.lock.Lock()
	defer c.lock.Unlock()

	if v, ok := c.data[key]; ok {
		c.data[key] = v + 1
		return
	}
	// NOTE(review): a fresh key starts at 0 rather than 1, so Inc followed by
	// Dec fails. Kept as-is because callers may rely on it; confirm intent.
	c.data[key] = 0
}

// DeleteKey removes key and its count. Deleting an unknown key is a no-op.
func (c *Counter) DeleteKey(key string) {
	c.lock.Lock()
	// Must release the lock here: re-locking in the defer would block forever
	// and leave the mutex held for every later caller.
	defer c.lock.Unlock()

	delete(c.data, key)
}

// Counts returns the value stored under key. It returns an error when the key
// is unknown, and returns the value together with an error when the value is
// negative.
func (c *Counter) Counts(key string) (int, error) {
	c.lock.Lock()
	defer c.lock.Unlock()

	v, ok := c.data[key]
	if !ok {
		return 0, fmt.Errorf("cannot get key %s", key)
	}
	var err error = nil
	if v < 0 {
		err = fmt.Errorf("count %s:%d is negative", key, v)
	}
	return v, err
}

// Dec subtracts one from the counter stored under key. A counter that reaches
// zero this way is removed. It returns an error when the key is unknown or
// its value is already below 1.
func (c *Counter) Dec(key string) error {
	c.lock.Lock()
	defer c.lock.Unlock()

	v, ok := c.data[key]
	switch {
	case !ok:
		return fmt.Errorf("cannot find key %s", key)
	case v > 1:
		c.data[key] = v - 1
		return nil
	case v == 1:
		// Delete directly: the lock is already held and sync.Mutex is not
		// reentrant, so calling DeleteKey from here would deadlock.
		delete(c.data, key)
		return nil
	default:
		return fmt.Errorf("cannot minus one: key %s has value %d", key, v)
	}
}
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package k8sutil 18 | 19 | import ( 20 | "context" 21 | "fmt" 22 | "net/http" 23 | 24 | "github.com/kubeflow/common/pkg/util" 25 | metav1unstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" 26 | "k8s.io/apimachinery/pkg/runtime" 27 | "k8s.io/apimachinery/pkg/runtime/schema" 28 | "k8s.io/apimachinery/pkg/runtime/serializer" 29 | "k8s.io/client-go/kubernetes/scheme" 30 | "k8s.io/client-go/rest" 31 | ) 32 | 33 | // CRDRestClient defines an interface for working with CRDs using the REST client. 34 | // In most cases we want to use the auto-generated clientset for specific CRDs. 35 | // The only exception is when the CRD spec is invalid and we can't parse the type into the corresponding 36 | // go struct. 37 | type CRDClient interface { 38 | // Update a Job. 39 | Update(obj *metav1unstructured.Unstructured) error 40 | } 41 | 42 | // CRDRestClient uses the Kubernetes rest interface to talk to the CRD. 
43 | type CRDRestClient struct { 44 | restcli *rest.RESTClient 45 | } 46 | 47 | func NewCRDRestClient(version *schema.GroupVersion) (*CRDRestClient, error) { 48 | config, err := GetClusterConfig() 49 | if err != nil { 50 | return nil, err 51 | } 52 | config.GroupVersion = version 53 | config.APIPath = "/apis" 54 | config.ContentType = runtime.ContentTypeJSON 55 | config.NegotiatedSerializer = serializer.WithoutConversionCodecFactory{CodecFactory: scheme.Codecs} 56 | 57 | restcli, err := rest.RESTClientFor(config) 58 | if err != nil { 59 | return nil, err 60 | } 61 | 62 | cli := &CRDRestClient{ 63 | restcli: restcli, 64 | } 65 | return cli, nil 66 | } 67 | 68 | // HttpClient returns the http client used. 69 | func (c *CRDRestClient) Client() *http.Client { 70 | return c.restcli.Client 71 | } 72 | 73 | func (c *CRDRestClient) Update(obj *metav1unstructured.Unstructured, plural string) error { 74 | logger := util.LoggerForUnstructured(obj, obj.GetKind()) 75 | // TODO(jlewi): Can we just call obj.GetKind() to get the kind? I think that will return the singular 76 | // not plural will that work? 
77 | if plural == "" { 78 | logger.Errorf("Could not issue update because plural not set.") 79 | return fmt.Errorf("plural must be set") 80 | } 81 | r := c.restcli.Put().Resource(plural).Namespace(obj.GetNamespace()).Name(obj.GetName()).Body(obj) 82 | _, err := r.DoRaw(context.TODO()) 83 | if err != nil { 84 | logger.Errorf("Could not issue update using URL: %v; error; %v", r.URL().String(), err) 85 | } 86 | return err 87 | } 88 | 89 | func (c *CRDRestClient) UpdateStatus(obj *metav1unstructured.Unstructured, plural string) error { 90 | logger := util.LoggerForUnstructured(obj, obj.GetKind()) 91 | if plural == "" { 92 | logger.Errorf("Could not issue update because plural not set.") 93 | return fmt.Errorf("plural must be set") 94 | } 95 | r := c.restcli.Put().Resource(plural).Namespace(obj.GetNamespace()).Name(obj.GetName()).SubResource("status").Body(obj) 96 | _, err := r.DoRaw(context.TODO()) 97 | if err != nil { 98 | logger.Errorf("Could not issue update using URL: %v; error; %v", r.URL().String(), err) 99 | } 100 | return err 101 | } 102 | -------------------------------------------------------------------------------- /pkg/util/k8sutil/k8sutil.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package k8sutil 16 | 17 | import ( 18 | "net" 19 | "os" 20 | 21 | apiv1 "github.com/kubeflow/common/pkg/apis/common/v1" 22 | log "github.com/sirupsen/logrus" 23 | "k8s.io/api/core/v1" 24 | apierrors "k8s.io/apimachinery/pkg/api/errors" 25 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 | "k8s.io/client-go/kubernetes" 27 | _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" // for gcp auth 28 | "k8s.io/client-go/rest" 29 | "k8s.io/client-go/tools/clientcmd" 30 | ) 31 | 32 | // RecommendedConfigPathEnvVar is a environment variable for path configuration 33 | const RecommendedConfigPathEnvVar = "KUBECONFIG" 34 | 35 | // MustNewKubeClient returns new kubernetes client for cluster configuration 36 | func MustNewKubeClient() kubernetes.Interface { 37 | cfg, err := GetClusterConfig() 38 | if err != nil { 39 | log.Fatal(err) 40 | } 41 | return kubernetes.NewForConfigOrDie(cfg) 42 | } 43 | 44 | // GetClusterConfig obtain the config from the Kube configuration used by kubeconfig, or from k8s cluster. 45 | func GetClusterConfig() (*rest.Config, error) { 46 | if len(os.Getenv(RecommendedConfigPathEnvVar)) > 0 { 47 | // use the current context in kubeconfig 48 | // This is very useful for running locally. 
49 | return clientcmd.BuildConfigFromFlags("", os.Getenv(RecommendedConfigPathEnvVar)) 50 | } 51 | 52 | // Work around https://github.com/kubernetes/kubernetes/issues/40973 53 | // See https://github.com/coreos/etcd-operator/issues/731#issuecomment-283804819 54 | if len(os.Getenv("KUBERNETES_SERVICE_HOST")) == 0 { 55 | addrs, err := net.LookupHost("kubernetes.default.svc") 56 | if err != nil { 57 | panic(err) 58 | } 59 | if err := os.Setenv("KUBERNETES_SERVICE_HOST", addrs[0]); err != nil { 60 | return nil, err 61 | } 62 | } 63 | if len(os.Getenv("KUBERNETES_SERVICE_PORT")) == 0 { 64 | if err := os.Setenv("KUBERNETES_SERVICE_PORT", "443"); err != nil { 65 | panic(err) 66 | } 67 | } 68 | return rest.InClusterConfig() 69 | } 70 | 71 | // IsKubernetesResourceAlreadyExistError throws error when kubernetes resources already exist. 72 | func IsKubernetesResourceAlreadyExistError(err error) bool { 73 | return apierrors.IsAlreadyExists(err) 74 | } 75 | 76 | // IsKubernetesResourceNotFoundError throws error when there is no kubernetes resource found. 77 | func IsKubernetesResourceNotFoundError(err error) bool { 78 | return apierrors.IsNotFound(err) 79 | } 80 | 81 | // TODO(jlewi): CascadeDeletOptions are part of garbage collection policy. 82 | // CascadeDeleteOptions deletes the workload after the grace period 83 | // Do we want to use this? See 84 | // https://kubernetes.io/docs/concepts/workloads/controllers/garbage-collection/ 85 | func CascadeDeleteOptions(gracePeriodSeconds int64) *metav1.DeleteOptions { 86 | return &metav1.DeleteOptions{ 87 | GracePeriodSeconds: func(t int64) *int64 { return &t }(gracePeriodSeconds), 88 | PropagationPolicy: func() *metav1.DeletionPropagation { 89 | foreground := metav1.DeletePropagationForeground 90 | return &foreground 91 | }(), 92 | } 93 | } 94 | 95 | // FilterActivePods returns pods that have not terminated. 
96 | func FilterActivePods(pods []*v1.Pod) []*v1.Pod { 97 | var result []*v1.Pod 98 | for _, p := range pods { 99 | if IsPodActive(p) { 100 | result = append(result, p) 101 | } else { 102 | log.Infof("Ignoring inactive pod %v/%v in state %v, deletion time %v", 103 | p.Namespace, p.Name, p.Status.Phase, p.DeletionTimestamp) 104 | } 105 | } 106 | return result 107 | } 108 | 109 | func IsPodActive(p *v1.Pod) bool { 110 | return v1.PodSucceeded != p.Status.Phase && 111 | v1.PodFailed != p.Status.Phase && 112 | p.DeletionTimestamp == nil 113 | } 114 | 115 | // filterPodCount returns pods based on their phase. 116 | func FilterPodCount(pods []*v1.Pod, phase v1.PodPhase) int32 { 117 | var result int32 118 | for i := range pods { 119 | if phase == pods[i].Status.Phase { 120 | result++ 121 | } 122 | } 123 | return result 124 | } 125 | 126 | func GetTotalReplicas(replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec) int32 { 127 | jobReplicas := int32(0) 128 | for _, r := range replicas { 129 | if r.Replicas != nil { 130 | jobReplicas += *r.Replicas 131 | } else { 132 | // If unspecified, defaults to 1. 133 | jobReplicas += 1 134 | } 135 | } 136 | return jobReplicas 137 | } 138 | 139 | func GetTotalFailedReplicas(replicas map[apiv1.ReplicaType]*apiv1.ReplicaStatus) int32 { 140 | totalFailedReplicas := int32(0) 141 | for _, status := range replicas { 142 | totalFailedReplicas += status.Failed 143 | } 144 | return totalFailedReplicas 145 | } 146 | -------------------------------------------------------------------------------- /pkg/util/labels/labels.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package labels 16 | 17 | import ( 18 | "errors" 19 | "strconv" 20 | 21 | v1 "github.com/kubeflow/common/pkg/apis/common/v1" 22 | ) 23 | 24 | func ReplicaIndex(labels map[string]string) (int, error) { 25 | v, ok := labels[v1.ReplicaIndexLabel] 26 | if !ok { 27 | return 0, errors.New("replica index label not found") 28 | } 29 | return strconv.Atoi(v) 30 | } 31 | 32 | func SetReplicaIndex(labels map[string]string, idx int) { 33 | SetReplicaIndexStr(labels, strconv.Itoa(idx)) 34 | } 35 | 36 | func SetReplicaIndexStr(labels map[string]string, idx string) { 37 | labels[v1.ReplicaIndexLabel] = idx 38 | } 39 | 40 | func ReplicaType(labels map[string]string) (v1.ReplicaType, error) { 41 | v, ok := labels[v1.ReplicaTypeLabel] 42 | if !ok { 43 | return "", errors.New("replica type label not found") 44 | } 45 | return v1.ReplicaType(v), nil 46 | } 47 | 48 | func SetReplicaType(labels map[string]string, rt string) { 49 | labels[v1.ReplicaTypeLabel] = rt 50 | } 51 | 52 | func HasKnownLabels(labels map[string]string, groupName string) bool { 53 | _, has := labels[v1.OperatorNameLabel] 54 | return has 55 | } 56 | 57 | func SetJobRole(labels map[string]string, role string) { 58 | labels[v1.JobRoleLabel] = role 59 | } 60 | -------------------------------------------------------------------------------- /pkg/util/labels/labels_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2023 The Kubeflow Authors. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package labels 18 | 19 | import ( 20 | "testing" 21 | 22 | v1 "github.com/kubeflow/common/pkg/apis/common/v1" 23 | ) 24 | 25 | func TestReplicaIndex(t *testing.T) { 26 | cases := map[string]struct { 27 | labels map[string]string 28 | want int 29 | wantErr bool 30 | }{ 31 | "new": { 32 | labels: map[string]string{ 33 | v1.ReplicaIndexLabel: "2", 34 | }, 35 | want: 2, 36 | }, 37 | "old": { 38 | labels: map[string]string{ 39 | v1.ReplicaIndexLabel: "3", 40 | }, 41 | want: 3, 42 | }, 43 | "none": { 44 | labels: map[string]string{}, 45 | wantErr: true, 46 | }, 47 | "both": { 48 | labels: map[string]string{ 49 | v1.ReplicaIndexLabel: "4", 50 | }, 51 | want: 4, 52 | }, 53 | } 54 | for name, tc := range cases { 55 | t.Run(name, func(t *testing.T) { 56 | got, err := ReplicaIndex(tc.labels) 57 | if gotErr := err != nil; tc.wantErr != gotErr { 58 | t.Errorf("ReplicaIndex returned error (%t) want (%t)", gotErr, tc.wantErr) 59 | } 60 | if got != tc.want { 61 | t.Errorf("ReplicaIndex returned %d, want %d", got, tc.want) 62 | } 63 | }) 64 | } 65 | } 66 | 67 | func TestReplicaType(t *testing.T) { 68 | cases := map[string]struct { 69 | labels map[string]string 70 | want v1.ReplicaType 71 | wantErr bool 72 | }{ 73 | "new": { 74 | labels: map[string]string{ 75 | v1.ReplicaTypeLabel: "Foo", 76 | }, 77 | want: "Foo", 78 | }, 79 | "old": { 80 | labels: map[string]string{ 81 | v1.ReplicaTypeLabel: "Bar", 82 | 
}, 83 | want: "Bar", 84 | }, 85 | "none": { 86 | labels: map[string]string{}, 87 | wantErr: true, 88 | }, 89 | "both": { 90 | labels: map[string]string{ 91 | v1.ReplicaTypeLabel: "Baz", 92 | }, 93 | want: "Baz", 94 | }, 95 | } 96 | for name, tc := range cases { 97 | t.Run(name, func(t *testing.T) { 98 | got, err := ReplicaType(tc.labels) 99 | if gotErr := err != nil; tc.wantErr != gotErr { 100 | t.Errorf("ReplicaType returned error (%t) want (%t)", gotErr, tc.wantErr) 101 | } 102 | if got != tc.want { 103 | t.Errorf("ReplicaType returned %v, want %v", got, tc.want) 104 | } 105 | }) 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /pkg/util/logger.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package util 16 | 17 | import ( 18 | "strings" 19 | 20 | log "github.com/sirupsen/logrus" 21 | v1 "k8s.io/api/core/v1" 22 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 23 | metav1unstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" 24 | ) 25 | 26 | func LoggerForReplica(job metav1.Object, rtype string) *log.Entry { 27 | return log.WithFields(log.Fields{ 28 | // We use job to match the key used in controller.go 29 | // Its more common in K8s to use a period to indicate namespace.name. So that's what we use. 
30 | "job": job.GetNamespace() + "." + job.GetName(), 31 | "uid": job.GetUID(), 32 | "replica-type": rtype, 33 | }) 34 | } 35 | 36 | func LoggerForJob(job metav1.Object) *log.Entry { 37 | return log.WithFields(log.Fields{ 38 | // We use job to match the key used in controller.go 39 | // Its more common in K8s to use a period to indicate namespace.name. So that's what we use. 40 | "job": job.GetNamespace() + "." + job.GetName(), 41 | "uid": job.GetUID(), 42 | }) 43 | } 44 | 45 | func LoggerForPod(pod *v1.Pod, kind string) *log.Entry { 46 | job := "" 47 | if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil { 48 | if controllerRef.Kind == kind { 49 | job = pod.Namespace + "." + controllerRef.Name 50 | } 51 | } 52 | return log.WithFields(log.Fields{ 53 | // We use job to match the key used in controller.go 54 | // In controller.go we log the key used with the workqueue. 55 | "job": job, 56 | "pod": pod.Namespace + "." + pod.Name, 57 | "uid": pod.ObjectMeta.UID, 58 | }) 59 | } 60 | 61 | func LoggerForService(svc *v1.Service, kind string) *log.Entry { 62 | job := "" 63 | if controllerRef := metav1.GetControllerOf(svc); controllerRef != nil { 64 | if controllerRef.Kind == kind { 65 | job = svc.Namespace + "." + controllerRef.Name 66 | } 67 | } 68 | return log.WithFields(log.Fields{ 69 | // We use job to match the key used in controller.go 70 | // In controller.go we log the key used with the workqueue. 71 | "job": job, 72 | "service": svc.Namespace + "." + svc.Name, 73 | "uid": svc.ObjectMeta.UID, 74 | }) 75 | } 76 | 77 | func LoggerForKey(key string) *log.Entry { 78 | return log.WithFields(log.Fields{ 79 | // The key used by the workQueue should be namespace + "/" + name. 80 | // Its more common in K8s to use a period to indicate namespace.name. So that's what we use. 
81 | "job": strings.Replace(key, "/", ".", -1), 82 | }) 83 | } 84 | 85 | func LoggerForUnstructured(obj *metav1unstructured.Unstructured, kind string) *log.Entry { 86 | job := "" 87 | if obj.GetKind() == kind { 88 | job = obj.GetNamespace() + "." + obj.GetName() 89 | } 90 | return log.WithFields(log.Fields{ 91 | // We use job to match the key used in controller.go 92 | // In controller.go we log the key used with the workqueue. 93 | "job": job, 94 | "uid": obj.GetUID(), 95 | }) 96 | } 97 | -------------------------------------------------------------------------------- /pkg/util/signals/signal.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2017 The Kubeflow Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package signals 18 | 19 | import ( 20 | "os" 21 | "os/signal" 22 | ) 23 | 24 | var onlyOneSignalHandler = make(chan struct{}) 25 | 26 | // SetupSignalHandler registered for SIGTERM and SIGINT. A stop channel is returned 27 | // which is closed on one of these signals. If a second signal is caught, the program 28 | // is terminated with exit code 1. 29 | func SetupSignalHandler() (stopCh <-chan struct{}) { 30 | close(onlyOneSignalHandler) // panics when called twice 31 | 32 | stop := make(chan struct{}) 33 | c := make(chan os.Signal, 2) 34 | signal.Notify(c, shutdownSignals...) 35 | go func() { 36 | <-c 37 | close(stop) 38 | <-c 39 | os.Exit(1) // second signal. 
Exit directly. 40 | }() 41 | 42 | return stop 43 | } 44 | -------------------------------------------------------------------------------- /pkg/util/signals/signal_posix.go: -------------------------------------------------------------------------------- 1 | //go:build !windows 2 | // +build !windows 3 | 4 | /* 5 | Copyright 2017 The Kubeflow Authors. 6 | 7 | Licensed under the Apache License, Version 2.0 (the "License"); 8 | you may not use this file except in compliance with the License. 9 | You may obtain a copy of the License at 10 | 11 | http://www.apache.org/licenses/LICENSE-2.0 12 | 13 | Unless required by applicable law or agreed to in writing, software 14 | distributed under the License is distributed on an "AS IS" BASIS, 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | See the License for the specific language governing permissions and 17 | limitations under the License. 18 | */ 19 | 20 | package signals 21 | 22 | import ( 23 | "os" 24 | "syscall" 25 | ) 26 | 27 | var shutdownSignals = []os.Signal{os.Interrupt, syscall.SIGTERM} 28 | -------------------------------------------------------------------------------- /pkg/util/signals/signal_windows.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2017 The Kubeflow Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package signals 18 | 19 | import ( 20 | "os" 21 | ) 22 | 23 | var shutdownSignals = []os.Signal{os.Interrupt} 24 | -------------------------------------------------------------------------------- /pkg/util/status.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | apiv1 "github.com/kubeflow/common/pkg/apis/common/v1" 5 | v1 "k8s.io/api/core/v1" 6 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 7 | ) 8 | 9 | const ( 10 | // JobCreatedReason is added in a job when it is created. 11 | JobCreatedReason = "JobCreated" 12 | // JobSucceededReason is added in a job when it is succeeded. 13 | JobSucceededReason = "JobSucceeded" 14 | // JobRunningReason is added in a job when it is running. 15 | JobRunningReason = "JobRunning" 16 | // JobFailedReason is added in a job when it is failed. 17 | JobFailedReason = "JobFailed" 18 | // JobRestartingReason is added in a job when it is restarting. 19 | JobRestartingReason = "JobRestarting" 20 | // JobFailedValidationReason is added in a job when it failed validation 21 | JobFailedValidationReason = "JobFailedValidation" 22 | 23 | // labels for pods and servers. 
24 | 25 | ) 26 | 27 | // IsSucceeded checks if the job is succeeded 28 | func IsSucceeded(status apiv1.JobStatus) bool { 29 | return hasCondition(status, apiv1.JobSucceeded) 30 | } 31 | 32 | // IsFailed checks if the job is failed 33 | func IsFailed(status apiv1.JobStatus) bool { 34 | return hasCondition(status, apiv1.JobFailed) 35 | } 36 | 37 | // UpdateJobConditions adds to the jobStatus a new condition if needed, with the conditionType, reason, and message 38 | func UpdateJobConditions(jobStatus *apiv1.JobStatus, conditionType apiv1.JobConditionType, reason, message string) error { 39 | condition := newCondition(conditionType, reason, message) 40 | setCondition(jobStatus, condition) 41 | return nil 42 | } 43 | 44 | func hasCondition(status apiv1.JobStatus, condType apiv1.JobConditionType) bool { 45 | for _, condition := range status.Conditions { 46 | if condition.Type == condType && condition.Status == v1.ConditionTrue { 47 | return true 48 | } 49 | } 50 | return false 51 | } 52 | 53 | // newCondition creates a new job condition. 54 | func newCondition(conditionType apiv1.JobConditionType, reason, message string) apiv1.JobCondition { 55 | return apiv1.JobCondition{ 56 | Type: conditionType, 57 | Status: v1.ConditionTrue, 58 | LastUpdateTime: metav1.Now(), 59 | LastTransitionTime: metav1.Now(), 60 | Reason: reason, 61 | Message: message, 62 | } 63 | } 64 | 65 | // getCondition returns the condition with the provided type. 66 | func getCondition(status apiv1.JobStatus, condType apiv1.JobConditionType) *apiv1.JobCondition { 67 | for _, condition := range status.Conditions { 68 | if condition.Type == condType { 69 | return &condition 70 | } 71 | } 72 | return nil 73 | } 74 | 75 | // setCondition updates the job to include the provided condition. 76 | // If the condition that we are about to add already exists 77 | // and has the same status and reason then we are not going to update. 
78 | func setCondition(status *apiv1.JobStatus, condition apiv1.JobCondition) { 79 | // Do nothing if JobStatus have failed condition 80 | if IsFailed(*status) { 81 | return 82 | } 83 | 84 | currentCond := getCondition(*status, condition.Type) 85 | 86 | // Do nothing if condition doesn't change 87 | if currentCond != nil && currentCond.Status == condition.Status && currentCond.Reason == condition.Reason { 88 | return 89 | } 90 | 91 | // Do not update lastTransitionTime if the status of the condition doesn't change. 92 | if currentCond != nil && currentCond.Status == condition.Status { 93 | condition.LastTransitionTime = currentCond.LastTransitionTime 94 | } 95 | 96 | // Append the updated condition to the conditions 97 | newConditions := filterOutCondition(status.Conditions, condition.Type) 98 | status.Conditions = append(newConditions, condition) 99 | } 100 | 101 | // filterOutCondition returns a new slice of job conditions without conditions with the provided type. 102 | func filterOutCondition(conditions []apiv1.JobCondition, condType apiv1.JobConditionType) []apiv1.JobCondition { 103 | var newConditions []apiv1.JobCondition 104 | for _, c := range conditions { 105 | if condType == apiv1.JobRestarting && c.Type == apiv1.JobRunning { 106 | continue 107 | } 108 | if condType == apiv1.JobRunning && c.Type == apiv1.JobRestarting { 109 | continue 110 | } 111 | 112 | if c.Type == condType { 113 | continue 114 | } 115 | 116 | // Set the running condition status to be false when current condition failed or succeeded 117 | if (condType == apiv1.JobFailed || condType == apiv1.JobSucceeded) && c.Type == apiv1.JobRunning { 118 | c.Status = v1.ConditionFalse 119 | } 120 | 121 | newConditions = append(newConditions, c) 122 | } 123 | return newConditions 124 | } 125 | -------------------------------------------------------------------------------- /pkg/util/status_test.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | 
import ( 4 | "testing" 5 | 6 | apiv1 "github.com/kubeflow/common/pkg/apis/common/v1" 7 | "github.com/stretchr/testify/assert" 8 | corev1 "k8s.io/api/core/v1" 9 | ) 10 | 11 | func TestIsSucceeded(t *testing.T) { 12 | jobStatus := apiv1.JobStatus{ 13 | Conditions: []apiv1.JobCondition{ 14 | { 15 | Type: apiv1.JobSucceeded, 16 | Status: corev1.ConditionTrue, 17 | }, 18 | }, 19 | } 20 | assert.True(t, IsSucceeded(jobStatus)) 21 | } 22 | 23 | func TestIsFailed(t *testing.T) { 24 | jobStatus := apiv1.JobStatus{ 25 | Conditions: []apiv1.JobCondition{ 26 | { 27 | Type: apiv1.JobFailed, 28 | Status: corev1.ConditionTrue, 29 | }, 30 | }, 31 | } 32 | assert.True(t, IsFailed(jobStatus)) 33 | } 34 | 35 | func TestUpdateJobConditions(t *testing.T) { 36 | jobStatus := apiv1.JobStatus{} 37 | conditionType := apiv1.JobCreated 38 | reason := "Job Created" 39 | message := "Job Created" 40 | 41 | err := UpdateJobConditions(&jobStatus, conditionType, reason, message) 42 | if assert.NoError(t, err) { 43 | // Check JobCreated condition is appended 44 | conditionInStatus := jobStatus.Conditions[0] 45 | assert.Equal(t, conditionInStatus.Type, conditionType) 46 | assert.Equal(t, conditionInStatus.Reason, reason) 47 | assert.Equal(t, conditionInStatus.Message, message) 48 | } 49 | 50 | conditionType = apiv1.JobRunning 51 | reason = "Job Running" 52 | message = "Job Running" 53 | err = UpdateJobConditions(&jobStatus, conditionType, reason, message) 54 | if assert.NoError(t, err) { 55 | // Check JobRunning condition is appended 56 | conditionInStatus := jobStatus.Conditions[1] 57 | assert.Equal(t, conditionInStatus.Type, conditionType) 58 | assert.Equal(t, conditionInStatus.Reason, reason) 59 | assert.Equal(t, conditionInStatus.Message, message) 60 | } 61 | 62 | conditionType = apiv1.JobRestarting 63 | reason = "Job Restarting" 64 | message = "Job Restarting" 65 | err = UpdateJobConditions(&jobStatus, conditionType, reason, message) 66 | if assert.NoError(t, err) { 67 | // Check JobRunning 
condition is filtered out and JobRestarting state is appended 68 | conditionInStatus := jobStatus.Conditions[1] 69 | assert.Equal(t, conditionInStatus.Type, conditionType) 70 | assert.Equal(t, conditionInStatus.Reason, reason) 71 | assert.Equal(t, conditionInStatus.Message, message) 72 | } 73 | 74 | conditionType = apiv1.JobRunning 75 | reason = "Job Running" 76 | message = "Job Running" 77 | err = UpdateJobConditions(&jobStatus, conditionType, reason, message) 78 | if assert.NoError(t, err) { 79 | // Again, Check JobRestarting condition is filtered and JobRestarting is appended 80 | conditionInStatus := jobStatus.Conditions[1] 81 | assert.Equal(t, conditionInStatus.Type, conditionType) 82 | assert.Equal(t, conditionInStatus.Reason, reason) 83 | assert.Equal(t, conditionInStatus.Message, message) 84 | } 85 | 86 | conditionType = apiv1.JobFailed 87 | reason = "Job Failed" 88 | message = "Job Failed" 89 | err = UpdateJobConditions(&jobStatus, conditionType, reason, message) 90 | if assert.NoError(t, err) { 91 | // Check JobRunning condition is set to false 92 | jobRunningCondition := jobStatus.Conditions[1] 93 | assert.Equal(t, jobRunningCondition.Type, apiv1.JobRunning) 94 | assert.Equal(t, jobRunningCondition.Status, corev1.ConditionFalse) 95 | // Check JobFailed state is appended 96 | conditionInStatus := jobStatus.Conditions[2] 97 | assert.Equal(t, conditionInStatus.Type, conditionType) 98 | assert.Equal(t, conditionInStatus.Reason, reason) 99 | assert.Equal(t, conditionInStatus.Message, message) 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /pkg/util/train/train_util.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
// IsRetryableExitCode reports whether exitCode is treated as retryable, which
// is the case exactly when it is 128 or greater.
func IsRetryableExitCode(exitCode int32) bool {
	const retryableFloor = 128
	if exitCode < retryableFloor {
		return false
	}
	return true
}
14 | 15 | package train 16 | 17 | import "testing" 18 | 19 | func TestIsRetryableExitCode(t *testing.T) { 20 | tcs := []struct { 21 | ExitCode int32 22 | Expected bool 23 | }{ 24 | { 25 | ExitCode: 1, 26 | Expected: false, 27 | }, 28 | { 29 | ExitCode: 2, 30 | Expected: false, 31 | }, 32 | { 33 | ExitCode: 3, 34 | Expected: false, 35 | }, 36 | { 37 | ExitCode: 130, 38 | Expected: true, 39 | }, 40 | { 41 | ExitCode: 138, 42 | Expected: true, 43 | }, 44 | } 45 | 46 | for _, tc := range tcs { 47 | actual := IsRetryableExitCode(tc.ExitCode) 48 | if actual != tc.Expected { 49 | t.Errorf("ExitCode %d: Expected %t, got %t", tc.ExitCode, tc.Expected, actual) 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /pkg/util/util.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package util provides various helper routines. 
16 | package util 17 | 18 | import ( 19 | "encoding/json" 20 | "fmt" 21 | "math/rand" 22 | "time" 23 | 24 | log "github.com/sirupsen/logrus" 25 | _ "k8s.io/code-generator/pkg/util" 26 | _ "k8s.io/kube-openapi/pkg/common" 27 | ) 28 | 29 | const ( 30 | // EnvKubeflowNamespace is a environment variable for namespace when deployed on kubernetes 31 | EnvKubeflowNamespace = "KUBEFLOW_NAMESPACE" 32 | ) 33 | 34 | // Pformat returns a pretty format output of any value that can be marshaled to JSON. 35 | func Pformat(value interface{}) string { 36 | if s, ok := value.(string); ok { 37 | return s 38 | } 39 | valueJSON, err := json.MarshalIndent(value, "", " ") 40 | if err != nil { 41 | log.Warningf("Couldn't pretty format %v, error: %v", value, err) 42 | return fmt.Sprintf("%v", value) 43 | } 44 | return string(valueJSON) 45 | } 46 | 47 | // src is variable initialized with random value. 48 | var src = rand.NewSource(time.Now().UnixNano()) 49 | 50 | const letterBytes = "0123456789abcdefghijklmnopqrstuvwxyz" 51 | const ( 52 | letterIdxBits = 6 // 6 bits to represent a letter index 53 | letterIdxMask = 1<= 0; { 65 | if remain == 0 { 66 | cache, remain = src.Int63(), letterIdxMax 67 | } 68 | if idx := int(cache & letterIdxMask); idx < len(letterBytes) { 69 | b[i] = letterBytes[idx] 70 | i-- 71 | } 72 | cache >>= letterIdxBits 73 | remain-- 74 | } 75 | 76 | return string(b) 77 | } 78 | -------------------------------------------------------------------------------- /test_job/README.md: -------------------------------------------------------------------------------- 1 | ## Test Job Controller 2 | 3 | This is a Test Job Controller example. As you can see, we have job crd definition under `apis/test_job/v1`. 4 | [code-generator](https://github.com/kubernetes/code-generator) generate deepcopy, clientset and other libraries. 
5 | 6 | `controller.v1/test_job/test_job_controller` defines a struct `TestJobController` which implements [commonv1.ControllerInterface](../pkg/apis/common/v1/interface.go) 7 | 8 | ```yaml 9 | ├── README.md 10 | ├── apis 11 | │   └── test_job 12 | │   └── v1 13 | │   ├── constants.go 14 | │   ├── defaults.go 15 | │   ├── doc.go 16 | │   ├── openapi_generated.go 17 | │   ├── register.go 18 | │   ├── types.go 19 | │   ├── zz_generated.deepcopy.go 20 | │   └── zz_generated.defaults.go 21 | ├── client 22 | │   ├── clientset 23 | │   ├── informers 24 | │   └── listers 25 | ├── controller.v1 26 | │   └── test_job 27 | │   └── test_job_controller.go 28 | └── test_util 29 | ``` -------------------------------------------------------------------------------- /test_job/apis/test_job/v1/constants.go: -------------------------------------------------------------------------------- 1 | // Copyright 2019 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package v1 16 | 17 | import ( 18 | commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" 19 | ) 20 | 21 | const ( 22 | // EnvKubeflowNamespace is ENV for kubeflow namespace specified by user. 23 | EnvKubeflowNamespace = "KUBEFLOW_NAMESPACE" 24 | 25 | // DefaultPortName is name of the port used to communicate between workers. 26 | DefaultPortName = "job-port" 27 | // DefaultContainerName is the name of the TestJob container. 
28 | DefaultContainerName = "test-container" 29 | // DefaultPort is default value of the port. 30 | DefaultPort = 2222 31 | // DefaultRestartPolicy is default RestartPolicy for TFReplicaSpec. 32 | DefaultRestartPolicy = commonv1.RestartPolicyNever 33 | ) 34 | -------------------------------------------------------------------------------- /test_job/apis/test_job/v1/defaults.go: -------------------------------------------------------------------------------- 1 | // Copyright 2019 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package v1 16 | 17 | import ( 18 | "strings" 19 | 20 | commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" 21 | v1 "k8s.io/api/core/v1" 22 | "k8s.io/apimachinery/pkg/runtime" 23 | ) 24 | 25 | // Int32 is a helper routine that allocates a new int32 value 26 | // to store v and returns a pointer to it. 27 | func Int32(v int32) *int32 { 28 | return &v 29 | } 30 | 31 | func addDefaultingFuncs(scheme *runtime.Scheme) error { 32 | return RegisterDefaults(scheme) 33 | } 34 | 35 | // setDefaultPort sets the default ports for container. 
36 | func setDefaultPort(spec *v1.PodSpec) { 37 | index := 0 38 | for i, container := range spec.Containers { 39 | if container.Name == DefaultContainerName { 40 | index = i 41 | break 42 | } 43 | } 44 | 45 | hasJobPort := false 46 | for _, port := range spec.Containers[index].Ports { 47 | if port.Name == DefaultPortName { 48 | hasJobPort = true 49 | break 50 | } 51 | } 52 | if !hasJobPort { 53 | spec.Containers[index].Ports = append(spec.Containers[index].Ports, v1.ContainerPort{ 54 | Name: DefaultPortName, 55 | ContainerPort: DefaultPort, 56 | }) 57 | } 58 | } 59 | 60 | func setDefaultReplicas(spec *commonv1.ReplicaSpec) { 61 | if spec.Replicas == nil { 62 | spec.Replicas = Int32(1) 63 | } 64 | if spec.RestartPolicy == "" { 65 | spec.RestartPolicy = DefaultRestartPolicy 66 | } 67 | } 68 | 69 | // setTypeNamesToCamelCase sets the name of all replica types from any case to correct case. 70 | func setTypeNamesToCamelCase(testJob *TestJob) { 71 | setTypeNameToCamelCase(testJob, TestReplicaTypeWorker) 72 | setTypeNameToCamelCase(testJob, TestReplicaTypeMaster) 73 | } 74 | 75 | // setTypeNameToCamelCase sets the name of the replica type from any case to correct case. 76 | // E.g. from ps to PS; from WORKER to Worker. 77 | func setTypeNameToCamelCase(testJob *TestJob, typ TestReplicaType) { 78 | for t := range testJob.Spec.TestReplicaSpecs { 79 | if strings.EqualFold(string(t), string(typ)) && t != typ { 80 | spec := testJob.Spec.TestReplicaSpecs[t] 81 | delete(testJob.Spec.TestReplicaSpecs, t) 82 | testJob.Spec.TestReplicaSpecs[typ] = spec 83 | return 84 | } 85 | } 86 | } 87 | 88 | // SetDefaults_TestJob sets any unspecified values to defaults. 
89 | func SetDefaults_TestJob(testjob *TestJob) { 90 | // Set default RunPolicy 91 | if testjob.Spec.RunPolicy == nil { 92 | testjob.Spec.RunPolicy = &commonv1.RunPolicy{ 93 | CleanPodPolicy: nil, 94 | TTLSecondsAfterFinished: nil, 95 | ActiveDeadlineSeconds: nil, 96 | BackoffLimit: nil, 97 | SchedulingPolicy: nil, 98 | } 99 | } 100 | 101 | // Set default cleanpod policy to Running. 102 | if testjob.Spec.RunPolicy.CleanPodPolicy == nil { 103 | running := commonv1.CleanPodPolicyRunning 104 | testjob.Spec.RunPolicy.CleanPodPolicy = &running 105 | } 106 | 107 | // Update the key of TestReplicaSpecs to camel case. 108 | setTypeNamesToCamelCase(testjob) 109 | 110 | for _, spec := range testjob.Spec.TestReplicaSpecs { 111 | // Set default replicas to 1. 112 | setDefaultReplicas(spec) 113 | // Set default port to the container. 114 | setDefaultPort(&spec.Template.Spec) 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /test_job/apis/test_job/v1/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2019 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // +k8s:deepcopy-gen=package,register 16 | // +k8s:defaulter-gen=TypeMeta 17 | // +k8s:openapi-gen=true 18 | 19 | // Package v1 is the v1 version of the API. 
20 | // +groupName=kubeflow.org 21 | package v1 22 | -------------------------------------------------------------------------------- /test_job/apis/test_job/v1/register.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package v1 16 | 17 | import ( 18 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 19 | "k8s.io/apimachinery/pkg/runtime" 20 | "k8s.io/apimachinery/pkg/runtime/schema" 21 | ) 22 | 23 | var ( 24 | // TODO: move SchemeBuilder with zz_generated.deepcopy.go to k8s.io/api. 25 | // localSchemeBuilder and AddToScheme will stay in k8s.io/kubernetes. 26 | SchemeBuilder runtime.SchemeBuilder 27 | localSchemeBuilder = &SchemeBuilder 28 | AddToScheme = localSchemeBuilder.AddToScheme 29 | ) 30 | 31 | const ( 32 | // GroupName is the group name use in this package. 33 | GroupName = "kubeflow.org" 34 | // Kind is the kind name. 35 | Kind = "TestJob" 36 | // GroupVersion is the version. 37 | GroupVersion = "v1" 38 | // Plural is the Plural for TestJob. 39 | Plural = "testjobs" 40 | // Singular is the singular for TestJob. 41 | Singular = "testjob" 42 | // TESTCRD is the CRD name for TestJob. 43 | TESTCRD = "testjobs.kubeflow.org" 44 | ) 45 | 46 | var ( 47 | // SchemeGroupVersion is the group version used to register these objects. 
48 | SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: GroupVersion} 49 | // SchemeGroupVersionKind is the GroupVersionKind of the resource. 50 | SchemeGroupVersionKind = SchemeGroupVersion.WithKind(Kind) 51 | ) 52 | 53 | func init() { 54 | // We only register manually written functions here. The registration of the 55 | // generated functions takes place in the generated files. The separation 56 | // makes the code compile even when the generated files are missing. 57 | localSchemeBuilder.Register(addKnownTypes) 58 | localSchemeBuilder.Register(addDefaultingFuncs) 59 | } 60 | 61 | // Resource takes an unqualified resource and returns a Group-qualified GroupResource. 62 | func Resource(resource string) schema.GroupResource { 63 | return SchemeGroupVersion.WithResource(resource).GroupResource() 64 | } 65 | 66 | // addKnownTypes adds the set of types defined in this package to the supplied scheme. 67 | func addKnownTypes(scheme *runtime.Scheme) error { 68 | scheme.AddKnownTypes(SchemeGroupVersion, 69 | &TestJob{}, 70 | &TestJobList{}, 71 | ) 72 | metav1.AddToGroupVersion(scheme, SchemeGroupVersion) 73 | return nil 74 | } 75 | -------------------------------------------------------------------------------- /test_job/apis/test_job/v1/types.go: -------------------------------------------------------------------------------- 1 | // Copyright 2019 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// TestJobSpec is a desired state description of the TestJob.
type TestJobSpec struct {
	// RunPolicy holds the job-level run policy. It is optional in the JSON
	// form; when it is nil, SetDefaults_TestJob installs an empty policy
	// before applying the remaining defaults.
	RunPolicy *commonv1.RunPolicy `json:"runPolicy,omitempty"`
	// TestReplicaSpecs maps each replica type (for example Worker or Master)
	// to the ReplicaSpec describing that group of replicas.
	TestReplicaSpecs map[TestReplicaType]*commonv1.ReplicaSpec `json:"testReplicaSpecs"`
}
68 | Items []TestJob `json:"items"` 69 | } 70 | -------------------------------------------------------------------------------- /test_job/apis/test_job/v1/zz_generated.deepcopy.go: -------------------------------------------------------------------------------- 1 | //go:build !ignore_autogenerated 2 | // +build !ignore_autogenerated 3 | 4 | // Copyright 2023 The Kubeflow Authors 5 | // 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | 18 | // Code generated by deepcopy-gen. DO NOT EDIT. 19 | 20 | package v1 21 | 22 | import ( 23 | commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" 24 | runtime "k8s.io/apimachinery/pkg/runtime" 25 | ) 26 | 27 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 28 | func (in *TestJob) DeepCopyInto(out *TestJob) { 29 | *out = *in 30 | out.TypeMeta = in.TypeMeta 31 | in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) 32 | in.Spec.DeepCopyInto(&out.Spec) 33 | in.Status.DeepCopyInto(&out.Status) 34 | return 35 | } 36 | 37 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TestJob. 38 | func (in *TestJob) DeepCopy() *TestJob { 39 | if in == nil { 40 | return nil 41 | } 42 | out := new(TestJob) 43 | in.DeepCopyInto(out) 44 | return out 45 | } 46 | 47 | // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
48 | func (in *TestJob) DeepCopyObject() runtime.Object { 49 | if c := in.DeepCopy(); c != nil { 50 | return c 51 | } 52 | return nil 53 | } 54 | 55 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 56 | func (in *TestJobList) DeepCopyInto(out *TestJobList) { 57 | *out = *in 58 | out.TypeMeta = in.TypeMeta 59 | in.ListMeta.DeepCopyInto(&out.ListMeta) 60 | if in.Items != nil { 61 | in, out := &in.Items, &out.Items 62 | *out = make([]TestJob, len(*in)) 63 | for i := range *in { 64 | (*in)[i].DeepCopyInto(&(*out)[i]) 65 | } 66 | } 67 | return 68 | } 69 | 70 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TestJobList. 71 | func (in *TestJobList) DeepCopy() *TestJobList { 72 | if in == nil { 73 | return nil 74 | } 75 | out := new(TestJobList) 76 | in.DeepCopyInto(out) 77 | return out 78 | } 79 | 80 | // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 81 | func (in *TestJobList) DeepCopyObject() runtime.Object { 82 | if c := in.DeepCopy(); c != nil { 83 | return c 84 | } 85 | return nil 86 | } 87 | 88 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
89 | func (in *TestJobSpec) DeepCopyInto(out *TestJobSpec) { 90 | *out = *in 91 | if in.RunPolicy != nil { 92 | in, out := &in.RunPolicy, &out.RunPolicy 93 | *out = new(commonv1.RunPolicy) 94 | (*in).DeepCopyInto(*out) 95 | } 96 | if in.TestReplicaSpecs != nil { 97 | in, out := &in.TestReplicaSpecs, &out.TestReplicaSpecs 98 | *out = make(map[TestReplicaType]*commonv1.ReplicaSpec, len(*in)) 99 | for key, val := range *in { 100 | var outVal *commonv1.ReplicaSpec 101 | if val == nil { 102 | (*out)[key] = nil 103 | } else { 104 | in, out := &val, &outVal 105 | *out = new(commonv1.ReplicaSpec) 106 | (*in).DeepCopyInto(*out) 107 | } 108 | (*out)[key] = outVal 109 | } 110 | } 111 | return 112 | } 113 | 114 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TestJobSpec. 115 | func (in *TestJobSpec) DeepCopy() *TestJobSpec { 116 | if in == nil { 117 | return nil 118 | } 119 | out := new(TestJobSpec) 120 | in.DeepCopyInto(out) 121 | return out 122 | } 123 | -------------------------------------------------------------------------------- /test_job/apis/test_job/v1/zz_generated.defaults.go: -------------------------------------------------------------------------------- 1 | //go:build !ignore_autogenerated 2 | // +build !ignore_autogenerated 3 | 4 | // Copyright 2023 The Kubeflow Authors 5 | // 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | 18 | // Code generated by defaulter-gen. 
DO NOT EDIT. 19 | 20 | package v1 21 | 22 | import ( 23 | runtime "k8s.io/apimachinery/pkg/runtime" 24 | ) 25 | 26 | // RegisterDefaults adds defaulters functions to the given scheme. 27 | // Public to allow building arbitrary schemes. 28 | // All generated defaulters are covering - they call all nested defaulters. 29 | func RegisterDefaults(scheme *runtime.Scheme) error { 30 | scheme.AddTypeDefaultingFunc(&TestJob{}, func(obj interface{}) { SetObjectDefaults_TestJob(obj.(*TestJob)) }) 31 | scheme.AddTypeDefaultingFunc(&TestJobList{}, func(obj interface{}) { SetObjectDefaults_TestJobList(obj.(*TestJobList)) }) 32 | return nil 33 | } 34 | 35 | func SetObjectDefaults_TestJob(in *TestJob) { 36 | SetDefaults_TestJob(in) 37 | } 38 | 39 | func SetObjectDefaults_TestJobList(in *TestJobList) { 40 | for i := range in.Items { 41 | a := &in.Items[i] 42 | SetObjectDefaults_TestJob(a) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /test_job/client/clientset/versioned/clientset.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 
16 | 17 | package versioned 18 | 19 | import ( 20 | "fmt" 21 | "net/http" 22 | 23 | kubeflowv1 "github.com/kubeflow/common/test_job/client/clientset/versioned/typed/test_job/v1" 24 | discovery "k8s.io/client-go/discovery" 25 | rest "k8s.io/client-go/rest" 26 | flowcontrol "k8s.io/client-go/util/flowcontrol" 27 | ) 28 | 29 | type Interface interface { 30 | Discovery() discovery.DiscoveryInterface 31 | KubeflowV1() kubeflowv1.KubeflowV1Interface 32 | } 33 | 34 | // Clientset contains the clients for groups. Each group has exactly one 35 | // version included in a Clientset. 36 | type Clientset struct { 37 | *discovery.DiscoveryClient 38 | kubeflowV1 *kubeflowv1.KubeflowV1Client 39 | } 40 | 41 | // KubeflowV1 retrieves the KubeflowV1Client 42 | func (c *Clientset) KubeflowV1() kubeflowv1.KubeflowV1Interface { 43 | return c.kubeflowV1 44 | } 45 | 46 | // Discovery retrieves the DiscoveryClient 47 | func (c *Clientset) Discovery() discovery.DiscoveryInterface { 48 | if c == nil { 49 | return nil 50 | } 51 | return c.DiscoveryClient 52 | } 53 | 54 | // NewForConfig creates a new Clientset for the given config. 55 | // If config's RateLimiter is not set and QPS and Burst are acceptable, 56 | // NewForConfig will generate a rate-limiter in configShallowCopy. 57 | // NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), 58 | // where httpClient was generated with rest.HTTPClientFor(c). 59 | func NewForConfig(c *rest.Config) (*Clientset, error) { 60 | configShallowCopy := *c 61 | 62 | if configShallowCopy.UserAgent == "" { 63 | configShallowCopy.UserAgent = rest.DefaultKubernetesUserAgent() 64 | } 65 | 66 | // share the transport between all clients 67 | httpClient, err := rest.HTTPClientFor(&configShallowCopy) 68 | if err != nil { 69 | return nil, err 70 | } 71 | 72 | return NewForConfigAndClient(&configShallowCopy, httpClient) 73 | } 74 | 75 | // NewForConfigAndClient creates a new Clientset for the given config and http client. 
76 | // Note the http client provided takes precedence over the configured transport values. 77 | // If config's RateLimiter is not set and QPS and Burst are acceptable, 78 | // NewForConfigAndClient will generate a rate-limiter in configShallowCopy. 79 | func NewForConfigAndClient(c *rest.Config, httpClient *http.Client) (*Clientset, error) { 80 | configShallowCopy := *c 81 | if configShallowCopy.RateLimiter == nil && configShallowCopy.QPS > 0 { 82 | if configShallowCopy.Burst <= 0 { 83 | return nil, fmt.Errorf("burst is required to be greater than 0 when RateLimiter is not set and QPS is set to greater than 0") 84 | } 85 | configShallowCopy.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(configShallowCopy.QPS, configShallowCopy.Burst) 86 | } 87 | 88 | var cs Clientset 89 | var err error 90 | cs.kubeflowV1, err = kubeflowv1.NewForConfigAndClient(&configShallowCopy, httpClient) 91 | if err != nil { 92 | return nil, err 93 | } 94 | 95 | cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfigAndClient(&configShallowCopy, httpClient) 96 | if err != nil { 97 | return nil, err 98 | } 99 | return &cs, nil 100 | } 101 | 102 | // NewForConfigOrDie creates a new Clientset for the given config and 103 | // panics if there is an error in the config. 104 | func NewForConfigOrDie(c *rest.Config) *Clientset { 105 | cs, err := NewForConfig(c) 106 | if err != nil { 107 | panic(err) 108 | } 109 | return cs 110 | } 111 | 112 | // New creates a new Clientset for the given RESTClient. 
113 | func New(c rest.Interface) *Clientset { 114 | var cs Clientset 115 | cs.kubeflowV1 = kubeflowv1.New(c) 116 | 117 | cs.DiscoveryClient = discovery.NewDiscoveryClient(c) 118 | return &cs 119 | } 120 | -------------------------------------------------------------------------------- /test_job/client/clientset/versioned/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | // This package has the automatically generated clientset. 18 | package versioned 19 | -------------------------------------------------------------------------------- /test_job/client/clientset/versioned/fake/clientset_generated.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | package fake 18 | 19 | import ( 20 | clientset "github.com/kubeflow/common/test_job/client/clientset/versioned" 21 | kubeflowv1 "github.com/kubeflow/common/test_job/client/clientset/versioned/typed/test_job/v1" 22 | fakekubeflowv1 "github.com/kubeflow/common/test_job/client/clientset/versioned/typed/test_job/v1/fake" 23 | "k8s.io/apimachinery/pkg/runtime" 24 | "k8s.io/apimachinery/pkg/watch" 25 | "k8s.io/client-go/discovery" 26 | fakediscovery "k8s.io/client-go/discovery/fake" 27 | "k8s.io/client-go/testing" 28 | ) 29 | 30 | // NewSimpleClientset returns a clientset that will respond with the provided objects. 31 | // It's backed by a very simple object tracker that processes creates, updates and deletions as-is, 32 | // without applying any validations and/or defaults. It shouldn't be considered a replacement 33 | // for a real clientset and is mostly useful in simple unit tests. 34 | func NewSimpleClientset(objects ...runtime.Object) *Clientset { 35 | o := testing.NewObjectTracker(scheme, codecs.UniversalDecoder()) 36 | for _, obj := range objects { 37 | if err := o.Add(obj); err != nil { 38 | panic(err) 39 | } 40 | } 41 | 42 | cs := &Clientset{tracker: o} 43 | cs.discovery = &fakediscovery.FakeDiscovery{Fake: &cs.Fake} 44 | cs.AddReactor("*", "*", testing.ObjectReaction(o)) 45 | cs.AddWatchReactor("*", func(action testing.Action) (handled bool, ret watch.Interface, err error) { 46 | gvr := action.GetResource() 47 | ns := action.GetNamespace() 48 | watch, err := o.Watch(gvr, ns) 49 | if err != nil { 50 | return false, nil, err 51 | } 52 | return true, watch, nil 53 | }) 54 | 55 | return cs 56 | } 57 | 58 | // Clientset implements clientset.Interface. Meant to be embedded into a 59 | // struct to get a default implementation. 
This makes faking out just the method 60 | // you want to test easier. 61 | type Clientset struct { 62 | testing.Fake 63 | discovery *fakediscovery.FakeDiscovery 64 | tracker testing.ObjectTracker 65 | } 66 | 67 | func (c *Clientset) Discovery() discovery.DiscoveryInterface { 68 | return c.discovery 69 | } 70 | 71 | func (c *Clientset) Tracker() testing.ObjectTracker { 72 | return c.tracker 73 | } 74 | 75 | var ( 76 | _ clientset.Interface = &Clientset{} 77 | _ testing.FakeClient = &Clientset{} 78 | ) 79 | 80 | // KubeflowV1 retrieves the KubeflowV1Client 81 | func (c *Clientset) KubeflowV1() kubeflowv1.KubeflowV1Interface { 82 | return &fakekubeflowv1.FakeKubeflowV1{Fake: &c.Fake} 83 | } 84 | -------------------------------------------------------------------------------- /test_job/client/clientset/versioned/fake/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | // This package has the automatically generated fake clientset. 
// scheme is the runtime.Scheme used by the fake clientset; init registers this
// clientset's API types into it.
var scheme = runtime.NewScheme()

// codecs is the codec factory built on scheme; NewSimpleClientset takes its
// universal decoder from here for the object tracker.
var codecs = serializer.NewCodecFactory(scheme)

// localSchemeBuilder collects the AddToScheme functions of the API groups
// served by this clientset.
var localSchemeBuilder = runtime.SchemeBuilder{
	kubeflowv1.AddToScheme,
}
This allows composition 36 | // of clientsets, like in: 37 | // 38 | // import ( 39 | // "k8s.io/client-go/kubernetes" 40 | // clientsetscheme "k8s.io/client-go/kubernetes/scheme" 41 | // aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" 42 | // ) 43 | // 44 | // kclientset, _ := kubernetes.NewForConfig(c) 45 | // _ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) 46 | // 47 | // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types 48 | // correctly. 49 | var AddToScheme = localSchemeBuilder.AddToScheme 50 | 51 | func init() { 52 | v1.AddToGroupVersion(scheme, schema.GroupVersion{Version: "v1"}) 53 | utilruntime.Must(AddToScheme(scheme)) 54 | } 55 | -------------------------------------------------------------------------------- /test_job/client/clientset/versioned/scheme/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | // This package contains the scheme of the automatically generated clientset. 
// Scheme is the runtime.Scheme of this clientset; init registers the
// clientset's API types into it.
var Scheme = runtime.NewScheme()

// Codecs is the codec factory built on Scheme.
var Codecs = serializer.NewCodecFactory(Scheme)

// ParameterCodec is the parameter codec built on Scheme.
var ParameterCodec = runtime.NewParameterCodec(Scheme)

// localSchemeBuilder collects the AddToScheme functions of the API groups
// served by this clientset.
var localSchemeBuilder = runtime.SchemeBuilder{
	kubeflowv1.AddToScheme,
}
This allows composition 36 | // of clientsets, like in: 37 | // 38 | // import ( 39 | // "k8s.io/client-go/kubernetes" 40 | // clientsetscheme "k8s.io/client-go/kubernetes/scheme" 41 | // aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" 42 | // ) 43 | // 44 | // kclientset, _ := kubernetes.NewForConfig(c) 45 | // _ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) 46 | // 47 | // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types 48 | // correctly. 49 | var AddToScheme = localSchemeBuilder.AddToScheme 50 | 51 | func init() { 52 | v1.AddToGroupVersion(Scheme, schema.GroupVersion{Version: "v1"}) 53 | utilruntime.Must(AddToScheme(Scheme)) 54 | } 55 | -------------------------------------------------------------------------------- /test_job/client/clientset/versioned/typed/test_job/v1/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | // This package has the automatically generated typed clients. 
18 | package v1 19 | -------------------------------------------------------------------------------- /test_job/client/clientset/versioned/typed/test_job/v1/fake/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | // Package fake has the automatically generated clients. 18 | package fake 19 | -------------------------------------------------------------------------------- /test_job/client/clientset/versioned/typed/test_job/v1/fake/fake_test_job_client.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 
16 | 17 | package fake 18 | 19 | import ( 20 | v1 "github.com/kubeflow/common/test_job/client/clientset/versioned/typed/test_job/v1" 21 | rest "k8s.io/client-go/rest" 22 | testing "k8s.io/client-go/testing" 23 | ) 24 | 25 | type FakeKubeflowV1 struct { 26 | *testing.Fake 27 | } 28 | 29 | func (c *FakeKubeflowV1) TestJobs(namespace string) v1.TestJobInterface { 30 | return &FakeTestJobs{c, namespace} 31 | } 32 | 33 | // RESTClient returns a RESTClient that is used to communicate 34 | // with API server by this client implementation. 35 | func (c *FakeKubeflowV1) RESTClient() rest.Interface { 36 | var ret *rest.RESTClient 37 | return ret 38 | } 39 | -------------------------------------------------------------------------------- /test_job/client/clientset/versioned/typed/test_job/v1/generated_expansion.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 
// KubeflowV1Interface is the client surface for this API group version: it
// exposes the underlying REST client together with the TestJobsGetter methods.
type KubeflowV1Interface interface {
	// RESTClient returns the rest.Interface this client communicates through.
	RESTClient() rest.Interface
	TestJobsGetter
}
44 | func NewForConfig(c *rest.Config) (*KubeflowV1Client, error) { 45 | config := *c 46 | if err := setConfigDefaults(&config); err != nil { 47 | return nil, err 48 | } 49 | httpClient, err := rest.HTTPClientFor(&config) 50 | if err != nil { 51 | return nil, err 52 | } 53 | return NewForConfigAndClient(&config, httpClient) 54 | } 55 | 56 | // NewForConfigAndClient creates a new KubeflowV1Client for the given config and http client. 57 | // Note the http client provided takes precedence over the configured transport values. 58 | func NewForConfigAndClient(c *rest.Config, h *http.Client) (*KubeflowV1Client, error) { 59 | config := *c 60 | if err := setConfigDefaults(&config); err != nil { 61 | return nil, err 62 | } 63 | client, err := rest.RESTClientForConfigAndClient(&config, h) 64 | if err != nil { 65 | return nil, err 66 | } 67 | return &KubeflowV1Client{client}, nil 68 | } 69 | 70 | // NewForConfigOrDie creates a new KubeflowV1Client for the given config and 71 | // panics if there is an error in the config. 72 | func NewForConfigOrDie(c *rest.Config) *KubeflowV1Client { 73 | client, err := NewForConfig(c) 74 | if err != nil { 75 | panic(err) 76 | } 77 | return client 78 | } 79 | 80 | // New creates a new KubeflowV1Client for the given RESTClient. 81 | func New(c rest.Interface) *KubeflowV1Client { 82 | return &KubeflowV1Client{c} 83 | } 84 | 85 | func setConfigDefaults(config *rest.Config) error { 86 | gv := v1.SchemeGroupVersion 87 | config.GroupVersion = &gv 88 | config.APIPath = "/apis" 89 | config.NegotiatedSerializer = scheme.Codecs.WithoutConversion() 90 | 91 | if config.UserAgent == "" { 92 | config.UserAgent = rest.DefaultKubernetesUserAgent() 93 | } 94 | 95 | return nil 96 | } 97 | 98 | // RESTClient returns a RESTClient that is used to communicate 99 | // with API server by this client implementation. 
100 | func (c *KubeflowV1Client) RESTClient() rest.Interface { 101 | if c == nil { 102 | return nil 103 | } 104 | return c.restClient 105 | } 106 | -------------------------------------------------------------------------------- /test_job/client/informers/externalversions/generic.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by informer-gen. DO NOT EDIT. 16 | 17 | package externalversions 18 | 19 | import ( 20 | "fmt" 21 | 22 | v1 "github.com/kubeflow/common/test_job/apis/test_job/v1" 23 | schema "k8s.io/apimachinery/pkg/runtime/schema" 24 | cache "k8s.io/client-go/tools/cache" 25 | ) 26 | 27 | // GenericInformer is type of SharedIndexInformer which will locate and delegate to other 28 | // sharedInformers based on type 29 | type GenericInformer interface { 30 | Informer() cache.SharedIndexInformer 31 | Lister() cache.GenericLister 32 | } 33 | 34 | type genericInformer struct { 35 | informer cache.SharedIndexInformer 36 | resource schema.GroupResource 37 | } 38 | 39 | // Informer returns the SharedIndexInformer. 40 | func (f *genericInformer) Informer() cache.SharedIndexInformer { 41 | return f.informer 42 | } 43 | 44 | // Lister returns the GenericLister. 
45 | func (f *genericInformer) Lister() cache.GenericLister { 46 | return cache.NewGenericLister(f.Informer().GetIndexer(), f.resource) 47 | } 48 | 49 | // ForResource gives generic access to a shared informer of the matching type 50 | // TODO extend this to unknown resources with a client pool 51 | func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { 52 | switch resource { 53 | // Group=kubeflow.org, Version=v1 54 | case v1.SchemeGroupVersion.WithResource("testjobs"): 55 | return &genericInformer{resource: resource.GroupResource(), informer: f.Kubeflow().V1().TestJobs().Informer()}, nil 56 | 57 | } 58 | 59 | return nil, fmt.Errorf("no informer found for %v", resource) 60 | } 61 | -------------------------------------------------------------------------------- /test_job/client/informers/externalversions/internalinterfaces/factory_interfaces.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by informer-gen. DO NOT EDIT. 
16 | 17 | package internalinterfaces 18 | 19 | import ( 20 | time "time" 21 | 22 | versioned "github.com/kubeflow/common/test_job/client/clientset/versioned" 23 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 24 | runtime "k8s.io/apimachinery/pkg/runtime" 25 | cache "k8s.io/client-go/tools/cache" 26 | ) 27 | 28 | // NewInformerFunc takes versioned.Interface and time.Duration to return a SharedIndexInformer. 29 | type NewInformerFunc func(versioned.Interface, time.Duration) cache.SharedIndexInformer 30 | 31 | // SharedInformerFactory a small interface to allow for adding an informer without an import cycle 32 | type SharedInformerFactory interface { 33 | Start(stopCh <-chan struct{}) 34 | InformerFor(obj runtime.Object, newFunc NewInformerFunc) cache.SharedIndexInformer 35 | } 36 | 37 | // TweakListOptionsFunc is a function that transforms a v1.ListOptions. 38 | type TweakListOptionsFunc func(*v1.ListOptions) 39 | -------------------------------------------------------------------------------- /test_job/client/informers/externalversions/test_job/interface.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by informer-gen. DO NOT EDIT. 
16 | 17 | package test_job 18 | 19 | import ( 20 | internalinterfaces "github.com/kubeflow/common/test_job/client/informers/externalversions/internalinterfaces" 21 | v1 "github.com/kubeflow/common/test_job/client/informers/externalversions/test_job/v1" 22 | ) 23 | 24 | // Interface provides access to each of this group's versions. 25 | type Interface interface { 26 | // V1 provides access to shared informers for resources in V1. 27 | V1() v1.Interface 28 | } 29 | 30 | type group struct { 31 | factory internalinterfaces.SharedInformerFactory 32 | namespace string 33 | tweakListOptions internalinterfaces.TweakListOptionsFunc 34 | } 35 | 36 | // New returns a new Interface. 37 | func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { 38 | return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} 39 | } 40 | 41 | // V1 returns a new v1.Interface. 42 | func (g *group) V1() v1.Interface { 43 | return v1.New(g.factory, g.namespace, g.tweakListOptions) 44 | } 45 | -------------------------------------------------------------------------------- /test_job/client/informers/externalversions/test_job/v1/interface.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // Code generated by informer-gen. DO NOT EDIT. 16 | 17 | package v1 18 | 19 | import ( 20 | internalinterfaces "github.com/kubeflow/common/test_job/client/informers/externalversions/internalinterfaces" 21 | ) 22 | 23 | // Interface provides access to all the informers in this group version. 24 | type Interface interface { 25 | // TestJobs returns a TestJobInformer. 26 | TestJobs() TestJobInformer 27 | } 28 | 29 | type version struct { 30 | factory internalinterfaces.SharedInformerFactory 31 | namespace string 32 | tweakListOptions internalinterfaces.TweakListOptionsFunc 33 | } 34 | 35 | // New returns a new Interface. 36 | func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { 37 | return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} 38 | } 39 | 40 | // TestJobs returns a TestJobInformer. 41 | func (v *version) TestJobs() TestJobInformer { 42 | return &testJobInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} 43 | } 44 | -------------------------------------------------------------------------------- /test_job/client/informers/externalversions/test_job/v1/testjob.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // Code generated by informer-gen. DO NOT EDIT. 16 | 17 | package v1 18 | 19 | import ( 20 | "context" 21 | time "time" 22 | 23 | testjobv1 "github.com/kubeflow/common/test_job/apis/test_job/v1" 24 | versioned "github.com/kubeflow/common/test_job/client/clientset/versioned" 25 | internalinterfaces "github.com/kubeflow/common/test_job/client/informers/externalversions/internalinterfaces" 26 | v1 "github.com/kubeflow/common/test_job/client/listers/test_job/v1" 27 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 | runtime "k8s.io/apimachinery/pkg/runtime" 29 | watch "k8s.io/apimachinery/pkg/watch" 30 | cache "k8s.io/client-go/tools/cache" 31 | ) 32 | 33 | // TestJobInformer provides access to a shared informer and lister for 34 | // TestJobs. 35 | type TestJobInformer interface { 36 | Informer() cache.SharedIndexInformer 37 | Lister() v1.TestJobLister 38 | } 39 | 40 | type testJobInformer struct { 41 | factory internalinterfaces.SharedInformerFactory 42 | tweakListOptions internalinterfaces.TweakListOptionsFunc 43 | namespace string 44 | } 45 | 46 | // NewTestJobInformer constructs a new informer for TestJob type. 47 | // Always prefer using an informer factory to get a shared informer instead of getting an independent 48 | // one. This reduces memory footprint and number of connections to the server. 49 | func NewTestJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { 50 | return NewFilteredTestJobInformer(client, namespace, resyncPeriod, indexers, nil) 51 | } 52 | 53 | // NewFilteredTestJobInformer constructs a new informer for TestJob type. 54 | // Always prefer using an informer factory to get a shared informer instead of getting an independent 55 | // one. This reduces memory footprint and number of connections to the server. 
56 | func NewFilteredTestJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { 57 | return cache.NewSharedIndexInformer( 58 | &cache.ListWatch{ 59 | ListFunc: func(options metav1.ListOptions) (runtime.Object, error) { 60 | if tweakListOptions != nil { 61 | tweakListOptions(&options) 62 | } 63 | return client.KubeflowV1().TestJobs(namespace).List(context.TODO(), options) 64 | }, 65 | WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) { 66 | if tweakListOptions != nil { 67 | tweakListOptions(&options) 68 | } 69 | return client.KubeflowV1().TestJobs(namespace).Watch(context.TODO(), options) 70 | }, 71 | }, 72 | &testjobv1.TestJob{}, 73 | resyncPeriod, 74 | indexers, 75 | ) 76 | } 77 | 78 | func (f *testJobInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { 79 | return NewFilteredTestJobInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) 80 | } 81 | 82 | func (f *testJobInformer) Informer() cache.SharedIndexInformer { 83 | return f.factory.InformerFor(&testjobv1.TestJob{}, f.defaultInformer) 84 | } 85 | 86 | func (f *testJobInformer) Lister() v1.TestJobLister { 87 | return v1.NewTestJobLister(f.Informer().GetIndexer()) 88 | } 89 | -------------------------------------------------------------------------------- /test_job/client/listers/test_job/v1/expansion_generated.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by lister-gen. DO NOT EDIT. 16 | 17 | package v1 18 | 19 | // TestJobListerExpansion allows custom methods to be added to 20 | // TestJobLister. 21 | type TestJobListerExpansion interface{} 22 | 23 | // TestJobNamespaceListerExpansion allows custom methods to be added to 24 | // TestJobNamespaceLister. 25 | type TestJobNamespaceListerExpansion interface{} 26 | -------------------------------------------------------------------------------- /test_job/client/listers/test_job/v1/testjob.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by lister-gen. DO NOT EDIT. 
16 | 17 | package v1 18 | 19 | import ( 20 | v1 "github.com/kubeflow/common/test_job/apis/test_job/v1" 21 | "k8s.io/apimachinery/pkg/api/errors" 22 | "k8s.io/apimachinery/pkg/labels" 23 | "k8s.io/client-go/tools/cache" 24 | ) 25 | 26 | // TestJobLister helps list TestJobs. 27 | // All objects returned here must be treated as read-only. 28 | type TestJobLister interface { 29 | // List lists all TestJobs in the indexer. 30 | // Objects returned here must be treated as read-only. 31 | List(selector labels.Selector) (ret []*v1.TestJob, err error) 32 | // TestJobs returns an object that can list and get TestJobs. 33 | TestJobs(namespace string) TestJobNamespaceLister 34 | TestJobListerExpansion 35 | } 36 | 37 | // testJobLister implements the TestJobLister interface. 38 | type testJobLister struct { 39 | indexer cache.Indexer 40 | } 41 | 42 | // NewTestJobLister returns a new TestJobLister. 43 | func NewTestJobLister(indexer cache.Indexer) TestJobLister { 44 | return &testJobLister{indexer: indexer} 45 | } 46 | 47 | // List lists all TestJobs in the indexer. 48 | func (s *testJobLister) List(selector labels.Selector) (ret []*v1.TestJob, err error) { 49 | err = cache.ListAll(s.indexer, selector, func(m interface{}) { 50 | ret = append(ret, m.(*v1.TestJob)) 51 | }) 52 | return ret, err 53 | } 54 | 55 | // TestJobs returns an object that can list and get TestJobs. 56 | func (s *testJobLister) TestJobs(namespace string) TestJobNamespaceLister { 57 | return testJobNamespaceLister{indexer: s.indexer, namespace: namespace} 58 | } 59 | 60 | // TestJobNamespaceLister helps list and get TestJobs. 61 | // All objects returned here must be treated as read-only. 62 | type TestJobNamespaceLister interface { 63 | // List lists all TestJobs in the indexer for a given namespace. 64 | // Objects returned here must be treated as read-only. 65 | List(selector labels.Selector) (ret []*v1.TestJob, err error) 66 | // Get retrieves the TestJob from the indexer for a given namespace and name. 
67 | // Objects returned here must be treated as read-only. 68 | Get(name string) (*v1.TestJob, error) 69 | TestJobNamespaceListerExpansion 70 | } 71 | 72 | // testJobNamespaceLister implements the TestJobNamespaceLister 73 | // interface. 74 | type testJobNamespaceLister struct { 75 | indexer cache.Indexer 76 | namespace string 77 | } 78 | 79 | // List lists all TestJobs in the indexer for a given namespace. 80 | func (s testJobNamespaceLister) List(selector labels.Selector) (ret []*v1.TestJob, err error) { 81 | err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) { 82 | ret = append(ret, m.(*v1.TestJob)) 83 | }) 84 | return ret, err 85 | } 86 | 87 | // Get retrieves the TestJob from the indexer for a given namespace and name. 88 | func (s testJobNamespaceLister) Get(name string) (*v1.TestJob, error) { 89 | obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name) 90 | if err != nil { 91 | return nil, err 92 | } 93 | if !exists { 94 | return nil, errors.NewNotFound(v1.Resource("testjob"), name) 95 | } 96 | return obj.(*v1.TestJob), nil 97 | } 98 | -------------------------------------------------------------------------------- /test_job/controller.v1/test_job/test_job_controller.go: -------------------------------------------------------------------------------- 1 | package test_job 2 | 3 | import ( 4 | commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" 5 | v1 "github.com/kubeflow/common/test_job/apis/test_job/v1" 6 | log "github.com/sirupsen/logrus" 7 | corev1 "k8s.io/api/core/v1" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | "k8s.io/apimachinery/pkg/runtime/schema" 10 | ) 11 | // Compile-time check that *TestJobController satisfies commonv1.ControllerInterface. 12 | var _ commonv1.ControllerInterface = &TestJobController{} 13 | // TestJobController is a ControllerInterface implementation that serves one in-memory TestJob plus fixed pod and service lists. 
NOTE(review): nothing in this file assigns the embedded interface, so any method that is only promoted from it would presumably panic when called - confirm. 14 | type TestJobController struct { 15 | commonv1.ControllerInterface 16 | Job *v1.TestJob 17 | Pods []*corev1.Pod 18 | Services []*corev1.Service 19 | } 20 | // ControllerName returns the fixed name "test-operator". 21 | func (TestJobController) ControllerName() string { 22 | return "test-operator" 23 | } 24 | // GetAPIGroupVersionKind returns v1.SchemeGroupVersionKind. 25 | func 
(TestJobController) GetAPIGroupVersionKind() schema.GroupVersionKind { 26 | return v1.SchemeGroupVersionKind 27 | } 28 | // GetAPIGroupVersion returns v1.SchemeGroupVersion. 29 | func (TestJobController) GetAPIGroupVersion() schema.GroupVersion { 30 | return v1.SchemeGroupVersion 31 | } 32 | // GetGroupNameLabelValue returns v1.GroupName. 33 | func (TestJobController) GetGroupNameLabelValue() string { 34 | return v1.GroupName 35 | } 36 | // GetDefaultContainerPortName returns the fixed port name "default-port-name". 37 | func (TestJobController) GetDefaultContainerPortName() string { 38 | return "default-port-name" 39 | } 40 | // GetJobFromInformerCache ignores namespace and name and returns t.Job. NOTE(review): once DeleteJob has set t.Job to nil, the result is a non-nil metav1.Object wrapping a nil *v1.TestJob. 41 | func (t *TestJobController) GetJobFromInformerCache(namespace, name string) (metav1.Object, error) { 42 | return t.Job, nil 43 | } 44 | // GetJobFromAPIClient ignores namespace and name and returns t.Job, exactly like GetJobFromInformerCache. 45 | func (t *TestJobController) GetJobFromAPIClient(namespace, name string) (metav1.Object, error) { 46 | return t.Job, nil 47 | } 48 | // DeleteJob logs "Delete job" and sets t.Job to nil; the job argument is ignored. 49 | func (t *TestJobController) DeleteJob(job interface{}) error { 50 | log.Info("Delete job") 51 | t.Job = nil 52 | return nil 53 | } 54 | // UpdateJobStatus is a no-op that always returns nil. 55 | func (t *TestJobController) UpdateJobStatus(job interface{}, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, 56 | jobStatus *commonv1.JobStatus) error { 57 | return nil 58 | } 59 | // UpdateJobStatusInApiServer is a no-op that always returns nil. 60 | func (t *TestJobController) UpdateJobStatusInApiServer(job interface{}, jobStatus *commonv1.JobStatus) error { 61 | return nil 62 | } 63 | // SetClusterSpec is a no-op that always returns nil. 
64 | func (t *TestJobController) SetClusterSpec(job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, index string) error { 65 | return nil 66 | } 67 | // GetDefaultContainerName returns the fixed container name "default-container". 68 | func (t *TestJobController) GetDefaultContainerName() string { 69 | return "default-container" 70 | } 71 | // IsMasterRole reports true for every replica type and index. 72 | func (t *TestJobController) IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, rtype commonv1.ReplicaType, index int) bool { 73 | return true 74 | } 75 | -------------------------------------------------------------------------------- /test_job/reconciler.v1/test_job/dummy_client.go: -------------------------------------------------------------------------------- 1 | package test_job 2 | 3 | import ( 4 | "context" 5 | 6 | "k8s.io/apimachinery/pkg/api/errors" 7 | 
"k8s.io/apimachinery/pkg/runtime/schema" 8 | 9 | "k8s.io/apimachinery/pkg/api/meta" 10 | "k8s.io/apimachinery/pkg/runtime" 11 | "sigs.k8s.io/controller-runtime/pkg/client" 12 | ) 13 | // DummyClient is an in-memory stand-in for a controller-runtime client: Create, Delete and Update only touch Cache; the embedded Reader, Writer and StatusClient are not assigned anywhere in this file. 14 | type DummyClient struct { 15 | scheme *runtime.Scheme 16 | mapper meta.RESTMapper 17 | client.Reader 18 | client.Writer 19 | client.StatusClient 20 | Cache []client.Object 21 | } 22 | // Scheme returns the client's scheme field. 23 | func (c *DummyClient) Scheme() *runtime.Scheme { 24 | return c.scheme 25 | } 26 | // RESTMapper returns the client's mapper field. 27 | func (c *DummyClient) RESTMapper() meta.RESTMapper { 28 | return c.mapper 29 | } 30 | // Create appends obj to Cache; it never fails and does not check for duplicates. 31 | func (c *DummyClient) Create(ctx context.Context, obj client.Object, opts ...client.CreateOption) error { 32 | c.Cache = append(c.Cache, obj) 33 | return nil 34 | } 35 | // Delete removes the first cached object that sameObject matches against obj, or returns a NotFound error when there is none. 36 | func (c *DummyClient) Delete(ctx context.Context, obj client.Object, opts ...client.DeleteOption) error { 37 | for idx, o := range c.Cache { 38 | if sameObject(o, obj) { 39 | c.Cache = append(c.Cache[:idx], c.Cache[idx+1:]...) 
40 | return nil 41 | } 42 | } 43 | return errors.NewNotFound(schema.GroupResource{ 44 | Group: obj.GetObjectKind().GroupVersionKind().Group, 45 | Resource: obj.GetSelfLink(), 46 | }, obj.GetName()) 47 | } 48 | // Update replaces the first cached object that sameObject matches against obj with obj, or returns a NotFound error when there is none. 49 | func (c *DummyClient) Update(ctx context.Context, obj client.Object, opts ...client.UpdateOption) error { 50 | for idx, o := range c.Cache { 51 | if sameObject(o, obj) { 52 | c.Cache[idx] = obj 53 | return nil 54 | } 55 | } 56 | return errors.NewNotFound(schema.GroupResource{ 57 | Group: obj.GetObjectKind().GroupVersionKind().Group, 58 | Resource: obj.GetSelfLink(), 59 | }, obj.GetName()) 60 | } 61 | 62 | // sameObject reports whether a and b have the same name and namespace and are of the same kind. 63 | func sameObject(a, b client.Object) bool { 64 | if a.GetName() != b.GetName() || a.GetNamespace() != b.GetNamespace() { 65 | return false 66 | } 67 | // Comparing the ObjectKind interface values is an identity check for objects that embed metav1.TypeMeta (GetObjectKind returns a pointer to that field), so it only matches the very same instance. 68 | if a.GetObjectKind() == b.GetObjectKind() { 69 | return true 70 | } 71 | // Distinct instances also match when both carry the same non-empty GroupVersionKind; empty kinds are skipped so that differently typed objects sharing a name are not confused. 
72 | gvk := b.GetObjectKind().GroupVersionKind() 73 | return !gvk.Empty() && a.GetObjectKind().GroupVersionKind() == gvk 74 | } 75 | -------------------------------------------------------------------------------- /test_job/reconciler.v1/test_job/test_job_reconciler.go: -------------------------------------------------------------------------------- 1 | package test_job 2 | 3 | import ( 4 | "context" 5 | 6 | commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" 7 | common_reconciler "github.com/kubeflow/common/pkg/reconciler.v1/common" 8 | v1 "github.com/kubeflow/common/test_job/apis/test_job/v1" 9 | "github.com/kubeflow/common/test_job/client/clientset/versioned/scheme" 10 | 11 | corev1 "k8s.io/api/core/v1" 12 | "k8s.io/apimachinery/pkg/runtime" 13 | utilruntime "k8s.io/apimachinery/pkg/util/runtime" 14 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 15 | ctrl "sigs.k8s.io/controller-runtime" 16 | "sigs.k8s.io/controller-runtime/pkg/client" 17 | "sigs.k8s.io/controller-runtime/pkg/log" 18 | ) 19 | // TestReconciler composes the common reconciler components and answers job, pod, service and pod-group lookups from its own Job, Pods, Services and PodGroup fields. 20 | type TestReconciler struct { 21 | common_reconciler.ReconcilerUtil 22 | common_reconciler.ServiceReconciler 23 | common_reconciler.PodReconciler 24 | common_reconciler.VolcanoReconciler 25 | common_reconciler.JobReconciler 26 | 27 | DC *DummyClient 28 | Job *v1.TestJob 29 | Pods []*corev1.Pod 30 | Services []*corev1.Service 31 | PodGroup client.Object 32 | } 33 | // NewTestReconciler builds a TestReconciler whose bare components all share one DummyClient and receive the TestReconciler itself through their OverrideFor* hooks. 34 | func 
NewTestReconciler() *TestReconciler { 35 | testScheme := runtime.NewScheme() // named testScheme so the imported scheme package is not shadowed 36 | utilruntime.Must(clientgoscheme.AddToScheme(testScheme)) 37 | utilruntime.Must(v1.AddToScheme(testScheme)) 38 | 39 | dummyClient := &DummyClient{} 40 | 41 | r := &TestReconciler{ 42 | DC: dummyClient, 43 | } 44 | 45 | // Generate Bare Components 46 | jobR := common_reconciler.BareJobReconciler(dummyClient) 47 | jobR.OverrideForJobInterface(r, r, r, r) 48 | 49 | podR := common_reconciler.BarePodReconciler(dummyClient) 50 | podR.OverrideForPodInterface(r, r, r) 51 | 52 | svcR := common_reconciler.BareServiceReconciler(dummyClient) 53 | svcR.OverrideForServiceInterface(r, r, r) 54 | 55 | gangR := common_reconciler.BareVolcanoReconciler(dummyClient, nil, false) 56 | gangR.OverrideForGangSchedulingInterface(r) 57 | 58 | logger := log.Log 59 | utilR := common_reconciler.BareUtilReconciler(nil, logger, testScheme) 60 | //kubeflowReconciler := common_reconciler.BareKubeflowReconciler() 61 | 62 | r.JobReconciler = *jobR 63 | r.PodReconciler = *podR 64 | r.ServiceReconciler = *svcR 65 | r.VolcanoReconciler = *gangR 66 | r.ReconcilerUtil = *utilR 67 | 68 | return r 69 | } 70 | 71 | // Reconcile is part of the main kubernetes reconciliation loop which aims to 72 | // move the current state of the cluster closer to the desired state. 
73 | func (r *TestReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { 74 | _ = log.FromContext(ctx) 75 | 76 | job, err := r.GetJob(ctx, req) 77 | if err != nil { 78 | return ctrl.Result{}, err 79 | } 80 | 81 | logger := r.GetLogger(job) 82 | 83 | if job.GetDeletionTimestamp() != nil { 84 | return ctrl.Result{}, nil 85 | } 86 | 87 | scheme.Scheme.Default(job) 88 | 89 | // Get rid of SatisfiedExpectation 90 | replicasSpec, err := r.ExtractReplicasSpec(job) 91 | if err != nil { 92 | return ctrl.Result{}, err 93 | } 94 | 95 | runPolicy, err := r.ExtractRunPolicy(job) 96 | if err != nil { 97 | return ctrl.Result{}, err 98 | } 99 | 100 | status, err := r.ExtractJobStatus(job) 101 | if err != nil { 102 | return ctrl.Result{}, err 103 | } 104 | 105 | err = r.ReconcileJob(ctx, job, replicasSpec, status, runPolicy) 106 | if err != nil { 107 | logger.Info("Reconcile Test Job error %v", err) 108 | return ctrl.Result{}, err 109 | } 110 | 111 | return ctrl.Result{}, nil 112 | } 113 | 114 | func (r *TestReconciler) GetReconcilerName() string { 115 | return "Test Reconciler" 116 | } 117 | 118 | func (r *TestReconciler) GetJob(ctx context.Context, req ctrl.Request) (client.Object, error) { 119 | return r.Job, nil 120 | } 121 | 122 | func (r *TestReconciler) GetDefaultContainerName() string { 123 | return v1.DefaultContainerName 124 | } 125 | 126 | func (r *TestReconciler) GetPodGroupForJob(ctx context.Context, job client.Object) (client.Object, error) { 127 | return r.PodGroup, nil 128 | } 129 | 130 | func (r *TestReconciler) GetPodsForJob(ctx context.Context, job client.Object) ([]*corev1.Pod, error) { 131 | return r.Pods, nil 132 | } 133 | 134 | func (r *TestReconciler) GetServicesForJob(ctx context.Context, job client.Object) ([]*corev1.Service, error) { 135 | return r.Services, nil 136 | } 137 | 138 | func (r *TestReconciler) ExtractReplicasSpec(job client.Object) (map[commonv1.ReplicaType]*commonv1.ReplicaSpec, error) { 139 | tj := 
job.(*v1.TestJob) 140 | 141 | rs := map[commonv1.ReplicaType]*commonv1.ReplicaSpec{} 142 | for k, v := range tj.Spec.TestReplicaSpecs { 143 | rs[commonv1.ReplicaType(k)] = v 144 | } 145 | 146 | return rs, nil 147 | } 148 | 149 | func (r *TestReconciler) ExtractRunPolicy(job client.Object) (*commonv1.RunPolicy, error) { 150 | tj := job.(*v1.TestJob) 151 | 152 | return tj.Spec.RunPolicy, nil 153 | } 154 | 155 | func (r *TestReconciler) ExtractJobStatus(job client.Object) (*commonv1.JobStatus, error) { 156 | tj := job.(*v1.TestJob) 157 | 158 | return &tj.Status, nil 159 | } 160 | 161 | func (r *TestReconciler) IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, rtype commonv1.ReplicaType, index int) bool { 162 | return string(rtype) == string(v1.TestReplicaTypeMaster) 163 | } 164 | -------------------------------------------------------------------------------- /test_job/test_util/v1/const.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package v1 16 | 17 | import ( 18 | "time" 19 | ) 20 | // Shared fixture constants. 21 | const ( 22 | TestImageName = "test-image-for-kubeflow-common:latest" 23 | TestJobName = "test-job" 24 | LabelWorker = "worker" 25 | 26 | SleepInterval = 500 * time.Millisecond 27 | ThreadCount = 1 28 | ) 29 | 30 | var ( 31 | AlwaysReady = func() bool { return true } // AlwaysReady always reports true 32 | ) 33 | -------------------------------------------------------------------------------- /test_job/test_util/v1/pod.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package v1 16 | 17 | import ( 18 | "fmt" 19 | "testing" 20 | 21 | v1 "k8s.io/api/core/v1" 22 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 23 | "k8s.io/client-go/tools/cache" 24 | 25 | testjobv1 "github.com/kubeflow/common/test_job/apis/test_job/v1" 26 | ) 27 | 28 | const ( 29 | // labels for pods and servers. 
30 | testReplicaTypeLabel = "test-replica-type" 31 | testReplicaIndexLabel = "test-replica-index" 32 | ) 33 | 34 | var ( 35 | controllerKind = testjobv1.SchemeGroupVersionKind 36 | ) 37 | // NewBasePod returns a Pod called name in testJob's namespace, labelled with GenLabels(testJob.Name) and controller-owned by testJob; t is not used. 38 | func NewBasePod(name string, testJob *testjobv1.TestJob, t *testing.T) *v1.Pod { 39 | return &v1.Pod{ 40 | ObjectMeta: metav1.ObjectMeta{ 41 | Name: name, 42 | Labels: GenLabels(testJob.Name), 43 | Namespace: testJob.Namespace, 44 | OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(testJob, controllerKind)}, 45 | }, 46 | } 47 | } 48 | // NewPod returns a base pod named "<typ>-<index>" that also carries the replica type and replica index labels. 49 | func NewPod(testJob *testjobv1.TestJob, typ string, index int, t *testing.T) *v1.Pod { 50 | pod := NewBasePod(fmt.Sprintf("%s-%d", typ, index), testJob, t) 51 | pod.Labels[testReplicaTypeLabel] = typ 52 | pod.Labels[testReplicaIndexLabel] = fmt.Sprintf("%d", index) 53 | return pod 54 | } 55 | 56 | // NewPodList creates count pods with the given phase for the given testjob, with replica indices start, start+1, ... 57 | func NewPodList(count int32, status v1.PodPhase, testJob *testjobv1.TestJob, typ string, start int32, t *testing.T) []*v1.Pod { 58 | pods := []*v1.Pod{} 59 | for i := int32(0); i < count; i++ { 60 | newPod := NewPod(testJob, typ, int(start+i), t) 61 | newPod.Status = v1.PodStatus{Phase: status} 62 | pods = append(pods, newPod) 63 | } 64 | return pods 65 | } 66 | // SetPodsStatuses adds pending, running, succeeded and failed pods to podIndexer, in that order and with consecutive replica indices; restartCounts[i], when present, becomes the restart count of the i-th running pod, and add failures are reported through t.Errorf. 
67 | func SetPodsStatuses(podIndexer cache.Indexer, testJob *testjobv1.TestJob, typ string, pendingPods, activePods, succeededPods, failedPods int32, restartCounts []int32, t *testing.T) { 68 | var index int32 69 | for _, pod := range NewPodList(pendingPods, v1.PodPending, testJob, typ, index, t) { 70 | if err := podIndexer.Add(pod); err != nil { 71 | t.Errorf("%s: unexpected error when adding pod %v", testJob.Name, err) 72 | } 73 | } 74 | index += pendingPods 75 | for i, pod := range NewPodList(activePods, v1.PodRunning, testJob, typ, index, t) { 76 | if i < len(restartCounts) { // covers nil and also a slice shorter than activePods, which used to panic with index out of range 77 | pod.Status.ContainerStatuses = []v1.ContainerStatus{{RestartCount: restartCounts[i]}} 78 | } 79 | if err := 
podIndexer.Add(pod); err != nil { 80 | t.Errorf("%s: unexpected error when adding pod %v", testJob.Name, err) 81 | } 82 | } 83 | index += activePods 84 | for _, pod := range NewPodList(succeededPods, v1.PodSucceeded, testJob, typ, index, t) { 85 | if err := podIndexer.Add(pod); err != nil { 86 | t.Errorf("%s: unexpected error when adding pod %v", testJob.Name, err) 87 | } 88 | } 89 | index += succeededPods 90 | for _, pod := range NewPodList(failedPods, v1.PodFailed, testJob, typ, index, t) { 91 | if err := podIndexer.Add(pod); err != nil { 92 | t.Errorf("%s: unexpected error when adding pod %v", testJob.Name, err) 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /test_job/test_util/v1/service.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package v1 16 | 17 | import ( 18 | "fmt" 19 | "testing" 20 | 21 | v1 "k8s.io/api/core/v1" 22 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 23 | "k8s.io/client-go/tools/cache" 24 | 25 | testjobv1 "github.com/kubeflow/common/test_job/apis/test_job/v1" 26 | ) 27 | 28 | func NewBaseService(name string, testJob *testjobv1.TestJob, t *testing.T) *v1.Service { 29 | return &v1.Service{ 30 | ObjectMeta: metav1.ObjectMeta{ 31 | Name: name, 32 | Labels: GenLabels(testJob.Name), 33 | Namespace: testJob.Namespace, 34 | OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(testJob, controllerKind)}, 35 | }, 36 | } 37 | } 38 | 39 | func NewService(testJob *testjobv1.TestJob, typ string, index int, t *testing.T) *v1.Service { 40 | service := NewBaseService(fmt.Sprintf("%s-%d", typ, index), testJob, t) 41 | service.Labels[testReplicaTypeLabel] = typ 42 | service.Labels[testReplicaIndexLabel] = fmt.Sprintf("%d", index) 43 | return service 44 | } 45 | 46 | // NewServiceList creates count services of replica type typ for the given Job, with indices 0 to count-1. 47 | func NewServiceList(count int32, testJob *testjobv1.TestJob, typ string, t *testing.T) []*v1.Service { 48 | services := []*v1.Service{} 49 | for i := int32(0); i < count; i++ { 50 | newService := NewService(testJob, typ, int(i), t) 51 | services = append(services, newService) 52 | } 53 | return services 54 | } 55 | 56 | func SetServices(serviceIndexer cache.Indexer, testJob *testjobv1.TestJob, typ string, activeWorkerServices int32, t *testing.T) { 57 | for _, service := range NewServiceList(activeWorkerServices, testJob, typ, t) { 58 | if err := serviceIndexer.Add(service); err != nil { 59 | t.Errorf("unexpected error when adding service %v", err) 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /test_job/test_util/v1/test_job_util.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors 2 | // 3 | // 
Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package v1 16 | 17 | import ( 18 | "time" 19 | 20 | apiv1 "github.com/kubeflow/common/pkg/apis/common/v1" 21 | v1 "k8s.io/api/core/v1" 22 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 23 | 24 | testjobv1 "github.com/kubeflow/common/test_job/apis/test_job/v1" 25 | ) 26 | // NewTestJob returns a TestJob named TestJobName in the default namespace; when worker > 0 it carries a Worker replica spec with that many replicas. 27 | func NewTestJob(worker int) *testjobv1.TestJob { 28 | testJob := &testjobv1.TestJob{ 29 | TypeMeta: metav1.TypeMeta{ 30 | Kind: testjobv1.Kind, 31 | }, 32 | ObjectMeta: metav1.ObjectMeta{ 33 | Name: TestJobName, 34 | Namespace: metav1.NamespaceDefault, 35 | }, 36 | Spec: testjobv1.TestJobSpec{ 37 | TestReplicaSpecs: make(map[testjobv1.TestReplicaType]*apiv1.ReplicaSpec), 38 | }, 39 | } 40 | 41 | if worker > 0 { 42 | replicas := int32(worker) 43 | workerReplicaSpec := &apiv1.ReplicaSpec{ 44 | Replicas: &replicas, 45 | Template: NewTestReplicaSpecTemplate(), 46 | } 47 | testJob.Spec.TestReplicaSpecs[testjobv1.TestReplicaTypeWorker] = workerReplicaSpec 48 | } 49 | 50 | return testJob 51 | } 52 | // NewTestReplicaSpecTemplate returns a pod template with one container that uses TestImageName and the default container and port settings. 53 | func NewTestReplicaSpecTemplate() v1.PodTemplateSpec { 54 | return v1.PodTemplateSpec{ 55 | Spec: v1.PodSpec{ 56 | Containers: []v1.Container{ 57 | { 58 | Name: testjobv1.DefaultContainerName, 59 | Image: TestImageName, 60 | Args: []string{"Fake", "Fake"}, 61 | Ports: []v1.ContainerPort{ 62 | { 63 | Name: testjobv1.DefaultPortName, 64 | ContainerPort: testjobv1.DefaultPort, 65 | }, 66 
| }, 67 | }, 68 | }, 69 | }, 70 | } 71 | } 72 | // SetTestJobCompletionTime sets testJob.Status.CompletionTime to the current time. 73 | func SetTestJobCompletionTime(testJob *testjobv1.TestJob) { 74 | now := metav1.Time{Time: time.Now()} 75 | testJob.Status.CompletionTime = &now 76 | } 77 | -------------------------------------------------------------------------------- /test_job/test_util/v1/util.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package v1 16 | 17 | import ( 18 | "strings" 19 | "testing" 20 | 21 | apiv1 "github.com/kubeflow/common/pkg/apis/common/v1" 22 | v1 "k8s.io/api/core/v1" 23 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 24 | "k8s.io/client-go/tools/cache" 25 | 26 | testjobv1 "github.com/kubeflow/common/test_job/apis/test_job/v1" 27 | ) 28 | 29 | const ( 30 | LabelGroupName = "group-name" 31 | LabelTestJobName = "test-job-name" 32 | ) 33 | 34 | var ( 35 | // KeyFunc is the short name to DeletionHandlingMetaNamespaceKeyFunc. 36 | // IndexerInformer uses a delta queue, therefore for deletes we have to use this 37 | // key function but it should be just fine for non delete events. 
38 | KeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc 39 | TestGroupName = testjobv1.GroupName 40 | ) 41 | // GenLabels returns the group-name and test-job-name labels for jobName, with every "/" in the name replaced by "-". 42 | func GenLabels(jobName string) map[string]string { 43 | return map[string]string{ 44 | LabelGroupName: TestGroupName, 45 | LabelTestJobName: strings.ReplaceAll(jobName, "/", "-"), 46 | } 47 | } 48 | // GenOwnerReference builds an owner reference to testjob with both BlockOwnerDeletion and Controller set to true. 49 | func GenOwnerReference(testjob *testjobv1.TestJob) *metav1.OwnerReference { 50 | boolPtr := func(b bool) *bool { return &b } 51 | controllerRef := &metav1.OwnerReference{ 52 | APIVersion: testjobv1.SchemeGroupVersion.String(), 53 | Kind: testjobv1.Kind, 54 | Name: testjob.Name, 55 | UID: testjob.UID, 56 | BlockOwnerDeletion: boolPtr(true), 57 | Controller: boolPtr(true), 58 | } 59 | 60 | return controllerRef 61 | } 62 | // GetKey returns the KeyFunc key for testJob; on failure it reports the error through t and returns "". 63 | func GetKey(testJob *testjobv1.TestJob, t *testing.T) string { 64 | key, err := KeyFunc(testJob) 65 | if err != nil { 66 | t.Errorf("Unexpected error getting key for job %v: %v", testJob.Name, err) 67 | return "" 68 | } 69 | return key 70 | } 71 | // CheckCondition reports whether testJob has a condition of the given type whose status is True and whose reason equals reason. 72 | func CheckCondition(testJob *testjobv1.TestJob, condition apiv1.JobConditionType, reason string) bool { 73 | for _, v := range testJob.Status.Conditions { 74 | if v.Type == condition && v.Status == v1.ConditionTrue && v.Reason == reason { 75 | return true 76 | } 77 | } 78 | return false 79 | } 80 | --------------------------------------------------------------------------------