├── .github └── workflows │ └── ci-release.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── cmd ├── collector-controller │ └── main.go └── kcover │ └── main.go ├── cspell.config.yaml ├── docker ├── agent.Dockerfile └── controller.Dockerfile ├── go.mod ├── go.sum ├── manifests └── kcover │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ ├── _common.tpl │ ├── _helpers.tpl │ ├── clusterrole.yaml │ ├── clusterrolebinding.yaml │ ├── daemonset.yaml │ ├── deployment.yaml │ └── serviceaccount.yaml │ └── values.yaml └── pkg ├── constants └── const.go ├── diagnosis ├── collector.go ├── controller │ └── controller.go ├── nvidiadiag │ └── nvidia_diag.go └── podstatus │ └── pod_status.go ├── events ├── events.go └── kubeevents.go ├── kube └── kube.go ├── recovery ├── job.go └── recovery.go └── runner └── runner.go /.github/workflows/ci-release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | tags: 7 | - '*' 8 | 9 | jobs: 10 | go-unit-test: 11 | runs-on: ubuntu-20.04 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Set up Go 15 | uses: actions/setup-go@v4 16 | with: 17 | go-version: '1.23' 18 | - name: Test 19 | run: make test 20 | 21 | docker-build: 22 | runs-on: ubuntu-latest 23 | needs: [go-unit-test] 24 | if: startsWith(github.ref, 'refs/tags/v') 25 | steps: 26 | - name: Checkout 27 | uses: actions/checkout@v3 28 | - name: Set up QEMU 29 | uses: docker/setup-qemu-action@v2 30 | - name: Set up Docker Buildx 31 | uses: docker/setup-buildx-action@v2 32 | - name: Login Github Container registry 33 | uses: docker/login-action@v2 34 | with: 35 | registry: ghcr.io 36 | username: ${{ github.actor }} 37 | password: ${{ secrets.GITHUB_TOKEN }} 38 | 39 | - name: Docker meta Controller 40 | id: controller_meta 41 | uses: docker/metadata-action@v4 42 | with: 43 | images: | 44 | ghcr.io/baizeai/kcover-controller 45 | tags: | 46 | type=semver,pattern={{raw}} 47 
| 48 | - name: Docker meta Agent 49 | id: agent_meta 50 | uses: docker/metadata-action@v4 51 | with: 52 | images: | 53 | ghcr.io/baizeai/kcover-agent 54 | tags: | 55 | type=semver,pattern={{raw}} 56 | 57 | - name: Build Image Controller 58 | uses: docker/build-push-action@v4 59 | with: 60 | context: . 61 | platforms: linux/amd64,linux/arm64 62 | push: true 63 | provenance: false 64 | tags: ${{ steps.controller_meta.outputs.tags }} 65 | labels: ${{ steps.controller_meta.outputs.labels }} 66 | cache-from: type=gha 67 | cache-to: type=gha,mode=max 68 | file: docker/controller.Dockerfile 69 | 70 | - name: Build Image Agent 71 | uses: docker/build-push-action@v4 72 | with: 73 | context: . 74 | platforms: linux/amd64,linux/arm64 75 | push: true 76 | provenance: false 77 | tags: ${{ steps.agent_meta.outputs.tags }} 78 | labels: ${{ steps.agent_meta.outputs.labels }} 79 | cache-from: type=gha 80 | cache-to: type=gha,mode=max 81 | file: docker/agent.Dockerfile 82 | 83 | publish-chart: 84 | if: startsWith(github.ref, 'refs/tags/v') 85 | needs: [ docker-build ] 86 | permissions: 87 | contents: write 88 | env: 89 | HELM_CHARTS_DIR: manifests/kcover 90 | HELM_CHART_NAME: kcover 91 | runs-on: ubuntu-latest 92 | steps: 93 | - name: Checkout 94 | uses: actions/checkout@v4 95 | 96 | - name: Install Helm 97 | uses: azure/setup-helm@v3 98 | 99 | - name: Get the version 100 | id: get_version 101 | run: | 102 | VERSION=${GITHUB_REF#refs/tags/} 103 | echo "VERSION=${VERSION}" >> $GITHUB_OUTPUT 104 | 105 | - name: Tag helm chart image 106 | run: | 107 | image_tag=${{ steps.get_version.outputs.VERSION }} 108 | chart_version=${{ steps.get_version.outputs.VERSION }} 109 | sed -i "s/latest/${image_tag}/g" $HELM_CHARTS_DIR/values.yaml 110 | chart_smever=${chart_version#"v"} 111 | sed -i "s/0.1.0/${chart_smever}/g" $HELM_CHARTS_DIR/Chart.yaml 112 | 113 | - uses: getsentry/action-github-app-token@v2 114 | id: get_app_token 115 | with: 116 | app_id: ${{ secrets.APP_ID }} 117 | private_key: ${{ 
secrets.APP_PRIVATE_KEY }} 118 | - name: Sync Chart Repo 119 | run: | 120 | git config --global user.email "baize.ai[bot]@users.noreply.github.com" 121 | git config --global user.name "baize.ai[bot]" 122 | git clone https://x-access-token:${{ steps.get_app_token.outputs.token }}@github.com/BaizeAI/charts.git baize-charts 123 | helm package $HELM_CHARTS_DIR --destination ./baize-charts/docs/ 124 | helm repo index --url https://baizeai.github.io/charts ./baize-charts/docs/ 125 | cd baize-charts/ 126 | git add docs/ 127 | chart_version=${{ steps.get_version.outputs.VERSION }} 128 | chart_smever=${chart_version#"v"} 129 | git commit -m "update kcover chart ${chart_smever}" 130 | git push https://x-access-token:${{ steps.get_app_token.outputs.token }}@github.com/BaizeAI/charts.git 131 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ---> Go 2 | # Binaries for programs and plugins 3 | *.exe 4 | *.exe~ 5 | *.dll 6 | *.so 7 | *.dylib 8 | 9 | # Test binary, built with `go test -c` 10 | *.test 11 | 12 | # Output of the go coverage tool, specifically when used with LiteIDE 13 | *.out 14 | 15 | # Dependency directories (remove the comment below to include it) 16 | vendor/ 17 | 18 | .idea 19 | .vscode 20 | 21 | # logs 22 | _log 23 | 24 | *.local.yaml 25 | .DS_Store 26 | _output 27 | config/*.yaml 28 | apis/group/generate 29 | 30 | dist/ 31 | out/ 32 | bin 33 | .run/ 34 | .fleet 35 | 36 | # Temporary files 37 | temp/ 38 | tmp/ 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022 @merbridge 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | CONTAINER_CLI ?= docker 3 | 4 | HUB ?= release-ci.daocloud.io/baize 5 | 6 | VERSION ?= dev-$(shell git rev-parse --short=8 HEAD) 7 | 8 | image-%: 9 | $(CONTAINER_CLI) buildx build \ 10 | -t $(HUB)/kcover-$*:$(VERSION) \ 11 | -f docker/$*.Dockerfile \ 12 | --push \ 13 | --platform linux/amd64,linux/arm64 \ 14 | . 15 | 16 | images: image-controller image-agent 17 | 18 | test: 19 | KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... 
| grep -v /e2e) -coverprofile cover.out 20 | 21 | .PHONY: images 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # kcover - Kubernetes Coverage for Fault Awareness and Recovery 2 | 3 | Welcome to `kcover`, a Kubernetes solution designed to enhance the reliability and resilience of large-scale AI workloads by providing fault awareness and robust instant recovery mechanisms. 4 | 5 | ## Features 6 | 7 | - **Fault Awareness**: Detect and respond to hardware, network, and software failures dynamically. 8 | - **Instant Recovery**: Quickly restore operations without manual intervention, minimizing downtime and ensuring continuous training and service availability. 9 | - **Scalability**: Designed for large-scale environments, handling complexities of distributed AI workloads. 10 | 11 | ## Getting Started 12 | 13 | ### Prerequisites 14 | 15 | Ensure you have Kubernetes and Helm installed on your cluster. `kcover` is compatible with Kubernetes versions 1.19 and above. 16 | 17 | ### Installation 18 | 19 | Install `kcover` using Helm: 20 | 21 | ```shell 22 | helm repo add baizeai https://baizeai.github.io/charts 23 | helm install kcover baizeai/kcover --namespace kcover-system --create-namespace 24 | ``` 25 | 26 | ### Configuration 27 | 28 | Configure `kcover` to monitor specific Kubernetes resources by labeling them: 29 | 30 | ```shell 31 | kubectl label pytorchjobs kcover.io/cascading-recovery=true 32 | kubectl label pytorchjobs kcover.io/need-recovery=true 33 | ``` 34 | 35 | ## Usage 36 | 37 | Once installed, `kcover` will automatically monitor the labeled resources for any signs of failures and perform recovery actions as specified in the configuration. 
38 | -------------------------------------------------------------------------------- /cmd/collector-controller/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | "os/signal" 6 | "syscall" 7 | 8 | "github.com/baizeai/kcover/pkg/diagnosis" 9 | "github.com/baizeai/kcover/pkg/diagnosis/nvidiadiag" 10 | "github.com/baizeai/kcover/pkg/events" 11 | "github.com/baizeai/kcover/pkg/kube" 12 | "k8s.io/client-go/kubernetes" 13 | "k8s.io/klog/v2" 14 | ) 15 | 16 | func main() { 17 | var hostName string 18 | if hn := os.Getenv("FAST_RECOVERY_NODE_NAME"); hn != "" { 19 | hostName = hn 20 | } else { 21 | hn, err := os.Hostname() 22 | if err != nil { 23 | panic(err) 24 | } 25 | hostName = hn 26 | } 27 | 28 | dcgmDiag, err := nvidiadiag.NewDCGMDiagnosis(hostName) 29 | if err != nil { 30 | panic(err) 31 | } 32 | 33 | diags := []diagnosis.Diagnostic{dcgmDiag} 34 | cfg := kube.GetK8sConfigConfigWithFile("", "") 35 | client := kubernetes.NewForConfigOrDie(cfg) 36 | recorder := events.NewKubeEventsRecorder(client, false) 37 | 38 | for _, d := range diags { 39 | if err := d.Start(); err != nil { 40 | panic(err) 41 | } 42 | 43 | klog.Infof("diag %T started", d) 44 | 45 | go func(d diagnosis.Diagnostic) { 46 | for e := range d.Events() { 47 | if err := recorder.RecordEvent(e); err != nil { 48 | klog.Errorf("record event %+v error: %v", e, err) 49 | } 50 | } 51 | }(d) 52 | } 53 | 54 | cc := make(chan os.Signal, 1) 55 | signal.Notify(cc, os.Interrupt, syscall.SIGTERM) 56 | <-cc 57 | klog.Info("collector stopped") 58 | } 59 | -------------------------------------------------------------------------------- /cmd/kcover/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "os" 6 | "time" 7 | 8 | "github.com/baizeai/kcover/pkg/diagnosis/controller" 9 | "github.com/baizeai/kcover/pkg/events" 10 | 
"github.com/baizeai/kcover/pkg/kube" 11 | "github.com/baizeai/kcover/pkg/recovery" 12 | "github.com/baizeai/kcover/pkg/runner" 13 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 14 | "k8s.io/client-go/kubernetes" 15 | coordinationv1client "k8s.io/client-go/kubernetes/typed/coordination/v1" 16 | "k8s.io/client-go/tools/leaderelection" 17 | "k8s.io/client-go/tools/leaderelection/resourcelock" 18 | "k8s.io/klog/v2" 19 | ) 20 | 21 | func main() { 22 | hostName, err := os.Hostname() 23 | if err != nil { 24 | panic(err) 25 | } 26 | 27 | cfg := kube.GetK8sConfigConfigWithFile("", "") 28 | client := kubernetes.NewForConfigOrDie(cfg) 29 | var eventBus events.Recorder 30 | var rec runner.Runner 31 | var diag runner.Runner 32 | leaderElectionConfig := leaderelection.LeaderElectionConfig{ 33 | Lock: &resourcelock.LeaseLock{ 34 | Client: coordinationv1client.NewForConfigOrDie(kube.GetK8sConfigConfigWithFile("", "")), 35 | LeaseMeta: metav1.ObjectMeta{ 36 | Name: "kcover", 37 | Namespace: func() string { 38 | if bs, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/namespace"); err == nil { 39 | return string(bs) 40 | } 41 | return "default" 42 | }(), 43 | }, 44 | LockConfig: resourcelock.ResourceLockConfig{ 45 | Identity: hostName, 46 | }, 47 | }, 48 | ReleaseOnCancel: true, 49 | LeaseDuration: 15 * time.Second, 50 | RenewDeadline: 10 * time.Second, 51 | RetryPeriod: 2 * time.Second, 52 | Callbacks: leaderelection.LeaderCallbacks{ 53 | OnStartedLeading: func(ctx context.Context) { 54 | // 当当前实例成为 leader 时,开始执行 controller 逻辑 55 | var err error 56 | eventBus = events.NewKubeEventsRecorder(client, true) 57 | rec = recovery.NewRecoveryController(client, eventBus) 58 | diag, err = controller.NewControllerDiagnostic(client, eventBus) 59 | if err != nil { 60 | panic(err) 61 | } 62 | if err := rec.Start(); err != nil { 63 | panic(err) 64 | } 65 | if err := diag.Start(); err != nil { 66 | panic(err) 67 | } 68 | if err := eventBus.Start(); err != nil { 69 | panic(err) 70 
| } 71 | 72 | klog.Info("kcover started") 73 | }, 74 | OnStoppedLeading: func() { 75 | rec.Stop() 76 | diag.Stop() 77 | eventBus.Stop() 78 | klog.Info("kcover stopped") 79 | }, 80 | }, 81 | } 82 | 83 | leaderelection.RunOrDie(context.Background(), leaderElectionConfig) 84 | } 85 | -------------------------------------------------------------------------------- /cspell.config.yaml: -------------------------------------------------------------------------------- 1 | version: "0.2" 2 | ignorePaths: [] 3 | dictionaryDefinitions: [] 4 | dictionaries: [] 5 | words: 6 | - apimachinery 7 | - automount 8 | - baizeai 9 | - clientcmd 10 | - containerlogs 11 | - coordinationv1client 12 | - corev1 13 | - CUDA 14 | - CUDNN 15 | - daocloud 16 | - dcgm 17 | - dcgmi 18 | - exitcode 19 | - fullname 20 | - GOARCH 21 | - Infof 22 | - jellydator 23 | - kcover 24 | - klog 25 | - KUBECONFIG 26 | - Kubeflow 27 | - ldflags 28 | - leaderelection 29 | - metav1 30 | - NCCL 31 | - nvidiadiag 32 | - podstatus 33 | - pytorchjobs 34 | - resourcelock 35 | - samber 36 | - serviceaccount 37 | - stretchr 38 | - tfjobs 39 | - ttlcache 40 | - Warningf 41 | ignoreWords: [] 42 | import: [] 43 | -------------------------------------------------------------------------------- /docker/agent.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform=$BUILDPLATFORM m.daocloud.io/docker.io/golang:1.23.2 as builder 2 | 3 | WORKDIR /app 4 | 5 | COPY go.mod /app/go.mod 6 | COPY go.sum /app/go.sum 7 | 8 | RUN go env 9 | RUN go env -w CGO_ENABLED=0 10 | RUN go mod download 11 | 12 | ADD . . 13 | 14 | ARG TARGETARCH 15 | 16 | RUN CGO_ENABLED=0 GOOS=linux GOARCH=$TARGETARCH go build -ldflags "-s -w" -o kcover-agent ./cmd/collector-controller 17 | 18 | # runner 19 | FROM m.daocloud.io/docker.io/ubuntu:22.04 20 | 21 | WORKDIR /app 22 | 23 | # todo install dcgm toolkit? 
24 | 25 | COPY --from=builder /app/kcover-agent kcover-agent 26 | 27 | CMD /app/kcover-agent 28 | -------------------------------------------------------------------------------- /docker/controller.Dockerfile: -------------------------------------------------------------------------------- 1 | # builder 2 | FROM --platform=$BUILDPLATFORM m.daocloud.io/docker.io/golang:1.23.2 as builder 3 | 4 | WORKDIR /app 5 | 6 | COPY go.mod /app/go.mod 7 | COPY go.sum /app/go.sum 8 | 9 | RUN go env 10 | RUN go env -w CGO_ENABLED=0 11 | RUN go mod download 12 | 13 | ADD . . 14 | 15 | ARG TARGETARCH 16 | RUN CGO_ENABLED=0 GOOS=linux GOARCH=$TARGETARCH go build -ldflags "-s -w" -o kcover-controller ./cmd/kcover 17 | 18 | # runner 19 | FROM m.daocloud.io/docker.io/ubuntu:22.04 20 | 21 | WORKDIR /app 22 | 23 | COPY --from=builder /app/kcover-controller kcover-controller 24 | 25 | CMD /app/kcover-controller 26 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/baizeai/kcover 2 | 3 | go 1.23.0 4 | 5 | toolchain go1.23.2 6 | 7 | require ( 8 | github.com/jellydator/ttlcache/v3 v3.3.0 9 | github.com/samber/lo v1.47.0 10 | k8s.io/api v0.32.0 11 | k8s.io/apimachinery v0.32.0 12 | k8s.io/client-go v0.32.0 13 | k8s.io/klog/v2 v2.130.1 14 | ) 15 | 16 | require ( 17 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 18 | github.com/emicklei/go-restful/v3 v3.12.1 // indirect 19 | github.com/fxamacker/cbor/v2 v2.7.0 // indirect 20 | github.com/go-logr/logr v1.4.2 // indirect 21 | github.com/go-openapi/jsonpointer v0.21.0 // indirect 22 | github.com/go-openapi/jsonreference v0.21.0 // indirect 23 | github.com/go-openapi/swag v0.23.0 // indirect 24 | github.com/gogo/protobuf v1.3.2 // indirect 25 | github.com/golang/protobuf v1.5.4 // indirect 26 | github.com/google/gnostic-models v0.6.9 // indirect 27 | github.com/google/go-cmp 
v0.6.0 // indirect 28 | github.com/google/gofuzz v1.2.0 // indirect 29 | github.com/google/uuid v1.6.0 // indirect 30 | github.com/josharian/intern v1.0.0 // indirect 31 | github.com/json-iterator/go v1.1.12 // indirect 32 | github.com/mailru/easyjson v0.9.0 // indirect 33 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 34 | github.com/modern-go/reflect2 v1.0.2 // indirect 35 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 36 | github.com/pkg/errors v0.9.1 // indirect 37 | github.com/spf13/pflag v1.0.5 // indirect 38 | github.com/x448/float16 v0.8.4 // indirect 39 | golang.org/x/net v0.33.0 // indirect 40 | golang.org/x/oauth2 v0.24.0 // indirect 41 | golang.org/x/sync v0.10.0 // indirect 42 | golang.org/x/sys v0.28.0 // indirect 43 | golang.org/x/term v0.27.0 // indirect 44 | golang.org/x/text v0.21.0 // indirect 45 | golang.org/x/time v0.8.0 // indirect 46 | golang.org/x/tools v0.28.0 // indirect 47 | google.golang.org/protobuf v1.36.0 // indirect 48 | gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect 49 | gopkg.in/inf.v0 v0.9.1 // indirect 50 | gopkg.in/yaml.v3 v3.0.1 // indirect 51 | k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7 // indirect 52 | k8s.io/utils v0.0.0-20241210054802-24370beab758 // indirect 53 | sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect 54 | sigs.k8s.io/structured-merge-diff/v4 v4.5.0 // indirect 55 | sigs.k8s.io/yaml v1.4.0 // indirect 56 | ) 57 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 2 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 3 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= 4 | github.com/davecgh/go-spew 
v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 5 | github.com/emicklei/go-restful/v3 v3.12.1 h1:PJMDIM/ak7btuL8Ex0iYET9hxM3CI2sjZtzpL63nKAU= 6 | github.com/emicklei/go-restful/v3 v3.12.1/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= 7 | github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= 8 | github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= 9 | github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= 10 | github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 11 | github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= 12 | github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= 13 | github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= 14 | github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= 15 | github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= 16 | github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= 17 | github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= 18 | github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= 19 | github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= 20 | github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= 21 | github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= 22 | github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= 23 | github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= 24 | github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= 25 | github.com/google/go-cmp 
v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 26 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 27 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 28 | github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 29 | github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= 30 | github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 31 | github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo= 32 | github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= 33 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 34 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 35 | github.com/jellydator/ttlcache/v3 v3.3.0 h1:BdoC9cE81qXfrxeb9eoJi9dWrdhSuwXMAnHTbnBm4Wc= 36 | github.com/jellydator/ttlcache/v3 v3.3.0/go.mod h1:bj2/e0l4jRnQdrnSTaGTsh4GSXvMjQcy41i7th0GVGw= 37 | github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= 38 | github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= 39 | github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= 40 | github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= 41 | github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= 42 | github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= 43 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 44 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 45 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 46 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 47 
| github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= 48 | github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= 49 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 50 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= 51 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 52 | github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= 53 | github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= 54 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= 55 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= 56 | github.com/onsi/ginkgo/v2 v2.21.0 h1:7rg/4f3rB88pb5obDgNZrNHrQ4e6WpjonchcpuBRnZM= 57 | github.com/onsi/ginkgo/v2 v2.21.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= 58 | github.com/onsi/gomega v1.35.1 h1:Cwbd75ZBPxFSuZ6T+rN/WCb/gOc6YgFBXLlZLhC7Ds4= 59 | github.com/onsi/gomega v1.35.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= 60 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 61 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 62 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 63 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= 64 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 65 | github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= 66 | github.com/rogpeppe/go-internal v1.12.0/go.mod 
h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= 67 | github.com/samber/lo v1.47.0 h1:z7RynLwP5nbyRscyvcD043DWYoOcYRv3mV8lBeqOCLc= 68 | github.com/samber/lo v1.47.0/go.mod h1:RmDH9Ct32Qy3gduHQuKJ3gW1fMHAnE/fAzQuf6He5cU= 69 | github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= 70 | github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 71 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 72 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 73 | github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= 74 | github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 75 | github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= 76 | github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= 77 | github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 78 | github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 79 | go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= 80 | go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= 81 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 82 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 83 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 84 | golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 85 | golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 86 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 87 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod 
h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 88 | golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 89 | golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 90 | golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= 91 | golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= 92 | golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE= 93 | golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= 94 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 95 | golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 96 | golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 97 | golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= 98 | golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 99 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 100 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 101 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 102 | golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= 103 | golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 104 | golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q= 105 | golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= 106 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 107 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 108 | golang.org/x/text v0.21.0 
h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= 109 | golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= 110 | golang.org/x/time v0.8.0 h1:9i3RxcPv3PZnitoVGMPDKZSq1xW1gK1Xy3ArNOGZfEg= 111 | golang.org/x/time v0.8.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= 112 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 113 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 114 | golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= 115 | golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= 116 | golang.org/x/tools v0.28.0 h1:WuB6qZ4RPCQo5aP3WdKZS7i595EdWqWR8vqJTlwTVK8= 117 | golang.org/x/tools v0.28.0/go.mod h1:dcIOrVd3mfQKTgrDVQHqCPMWy6lnhfhtX3hLXYVLfRw= 118 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 119 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 120 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 121 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 122 | google.golang.org/protobuf v1.36.0 h1:mjIs9gYtt56AzC4ZaffQuh88TZurBGhIJMBZGSxNerQ= 123 | google.golang.org/protobuf v1.36.0/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= 124 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 125 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 126 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 127 | gopkg.in/evanphx/json-patch.v4 v4.12.0 
h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= 128 | gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= 129 | gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= 130 | gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= 131 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 132 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 133 | k8s.io/api v0.32.0 h1:OL9JpbvAU5ny9ga2fb24X8H6xQlVp+aJMFlgtQjR9CE= 134 | k8s.io/api v0.32.0/go.mod h1:4LEwHZEf6Q/cG96F3dqR965sYOfmPM7rq81BLgsE0p0= 135 | k8s.io/apimachinery v0.32.0 h1:cFSE7N3rmEEtv4ei5X6DaJPHHX0C+upp+v5lVPiEwpg= 136 | k8s.io/apimachinery v0.32.0/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= 137 | k8s.io/client-go v0.32.0 h1:DimtMcnN/JIKZcrSrstiwvvZvLjG0aSxy8PxN8IChp8= 138 | k8s.io/client-go v0.32.0/go.mod h1:boDWvdM1Drk4NJj/VddSLnx59X3OPgwrOo0vGbtq9+8= 139 | k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= 140 | k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= 141 | k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7 h1:hcha5B1kVACrLujCKLbr8XWMxCxzQx42DY8QKYJrDLg= 142 | k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7/go.mod h1:GewRfANuJ70iYzvn+i4lezLDAFzvjxZYK1gn1lWcfas= 143 | k8s.io/utils v0.0.0-20241210054802-24370beab758 h1:sdbE21q2nlQtFh65saZY+rRM6x6aJJI8IUa1AmH/qa0= 144 | k8s.io/utils v0.0.0-20241210054802-24370beab758/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= 145 | sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= 146 | sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= 147 | sigs.k8s.io/structured-merge-diff/v4 v4.5.0 h1:nbCitCK2hfnhyiKo6uf2HxUPTCodY6Qaf85SbDIaMBk= 148 | sigs.k8s.io/structured-merge-diff/v4 v4.5.0/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4= 149 | 
sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= 150 | sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= 151 | -------------------------------------------------------------------------------- /manifests/kcover/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /manifests/kcover/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: kcover 3 | description: A Helm chart for Kubernetes 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: "0.1.0" 19 | 20 | # This is the version number of the application being deployed. 
This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | appVersion: "0.1.0" 25 | -------------------------------------------------------------------------------- /manifests/kcover/templates/_common.tpl: -------------------------------------------------------------------------------- 1 | {{- define "common.images.image" -}} 2 | {{- $registryName := .imageRoot.registry -}} 3 | {{- $repositoryName := .imageRoot.repository -}} 4 | {{- $tag := .defaultTag -}} 5 | {{- if .global }} 6 | {{- if .global.imageRegistry }} 7 | {{- $registryName = .global.imageRegistry -}} 8 | {{- end -}} 9 | {{- end -}} 10 | {{- if .imageRoot.registry }} 11 | {{- $registryName = .imageRoot.registry -}} 12 | {{- end -}} 13 | {{- if .imageRoot.tag }} 14 | {{- $tag = .imageRoot.tag -}} 15 | {{- end -}} 16 | {{- if $registryName }} 17 | {{- printf "%s/%s:%s" $registryName $repositoryName $tag -}} 18 | {{- else -}} 19 | {{- printf "%s:%s" $repositoryName $tag -}} 20 | {{- end -}} 21 | {{- end -}} 22 | -------------------------------------------------------------------------------- /manifests/kcover/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "kcover.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 
12 | */}} 13 | {{- define "kcover.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "kcover.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "kcover.labels" -}} 37 | helm.sh/chart: {{ include "kcover.chart" . }} 38 | {{ include "kcover.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "kcover.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "kcover.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "kcover.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "kcover.fullname" .) 
# ClusterRole holding the cluster-wide permissions used by the kcover
# controller and agent.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  # The ClusterRoleBinding's roleRef points at "kcover.fullname", so the role
  # must carry that exact name. Naming it after the service account only
  # matched while serviceAccount.name was left empty.
  # ClusterRoles are cluster-scoped, so no metadata.namespace is set.
  name: {{ include "kcover.fullname" . }}
  labels:
    app: {{ include "kcover.serviceAccountName" . }}
rules:
  # Cluster resources
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
      - update
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - '*'
  - apiGroups:
      - ""
    resources:
      - namespaces
    verbs:
      - get
      - list
      - watch

  # Core v1 Pods
  - apiGroups:
      - ""
    resources:
      - pods
      # The log subresource is "pods/log" (singular); "pods/logs" matches nothing.
      - pods/log
    verbs:
      - '*'

  # Batch v1 Jobs
  - apiGroups:
      - batch
    resources:
      - jobs
    verbs:
      - get
      - list
      - watch

  # Kubeflow.org training jobs (read-only)
  - apiGroups:
      - kubeflow.org
    resources:
      - paddlejobs
      - mpijobs
      - mxjobs
      - pytorchjobs
      - tfjobs
      - xgboostjobs
    verbs:
      - get
      - list
      - watch

  # Leases
  - apiGroups:
      - coordination.k8s.io
    resources:
      - leases
    verbs:
      - '*'
}}-agent 20 | spec: 21 | {{- with .Values.agent.imagePullSecrets }} 22 | imagePullSecrets: 23 | {{- toYaml . | nindent 8 }} 24 | {{- end }} 25 | serviceAccountName: {{ include "kcover.serviceAccountName" . }} 26 | securityContext: 27 | {{- toYaml .Values.agent.podSecurityContext | nindent 8 }} 28 | containers: 29 | - name: agent 30 | securityContext: 31 | {{- toYaml .Values.agent.securityContext | nindent 12 }} 32 | image: {{ template "agent.image" . }} 33 | imagePullPolicy: {{ .Values.agent.image.pullPolicy }} 34 | env: 35 | - name: FAST_RECOVERY_NODE_NAME 36 | valueFrom: 37 | fieldRef: 38 | apiVersion: v1 39 | fieldPath: spec.nodeName 40 | resources: 41 | {{- toYaml .Values.agent.resources | nindent 12 }} 42 | -------------------------------------------------------------------------------- /manifests/kcover/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ include "kcover.fullname" . }}-controller 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | {{- include "kcover.labels" . | nindent 4 }} 8 | spec: 9 | replicas: {{ .Values.controller.replicas }} 10 | selector: 11 | matchLabels: 12 | app: {{ include "kcover.fullname" . }}-controller 13 | strategy: 14 | rollingUpdate: 15 | maxSurge: 25% 16 | maxUnavailable: 25% 17 | type: RollingUpdate 18 | template: 19 | metadata: 20 | {{- with .Values.controller.podAnnotations }} 21 | annotations: 22 | {{- toYaml . | nindent 8 }} 23 | {{- end }} 24 | labels: 25 | app: {{ include "kcover.fullname" . }}-controller 26 | spec: 27 | {{- with .Values.controller.imagePullSecrets }} 28 | imagePullSecrets: 29 | {{- toYaml . | nindent 8 }} 30 | {{- end }} 31 | serviceAccountName: {{ include "kcover.serviceAccountName" . 
}} 32 | securityContext: 33 | {{- toYaml .Values.controller.podSecurityContext | nindent 8 }} 34 | containers: 35 | - name: controller-container 36 | image: {{ template "controller.image" . }} 37 | imagePullPolicy: {{ .Values.controller.image.pullPolicy }} 38 | env: 39 | - name: FAST_RECOVERY_NODE_NAME 40 | valueFrom: 41 | fieldRef: 42 | apiVersion: v1 43 | fieldPath: spec.nodeName 44 | resources: 45 | {{- toYaml .Values.controller.resources | nindent 12 }} 46 | securityContext: 47 | {{- toYaml .Values.controller.securityContext | nindent 12 }} 48 | -------------------------------------------------------------------------------- /manifests/kcover/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "kcover.serviceAccountName" . }} 6 | labels: 7 | {{- include "kcover.labels" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | {{- end }} 13 | -------------------------------------------------------------------------------- /manifests/kcover/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for kcover. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 4 | 5 | fullnameOverride: "" 6 | nameOverride: "" 7 | 8 | serviceAccount: 9 | # Specifies whether a service account should be created 10 | create: true 11 | # Automatically mount a ServiceAccount's API credentials? 12 | automount: true 13 | # Annotations to add to the service account 14 | annotations: {} 15 | # The name of the service account to use. 
16 | # If not set and create is true, a name is generated using the fullname template 17 | name: "" 18 | 19 | global: 20 | imageRegistry: ghcr.io 21 | 22 | agent: 23 | image: 24 | registry: '' 25 | repository: baizeai/kcover-agent 26 | pullPolicy: IfNotPresent 27 | # Overrides the image tag whose default is the chart appVersion. 28 | tag: "latest" 29 | 30 | imagePullSecrets: [] 31 | 32 | podAnnotations: {} 33 | podLabels: {} 34 | podSecurityContext: {} 35 | # fsGroup: 2000 36 | securityContext: {} 37 | # capabilities: 38 | # drop: 39 | # - ALL 40 | # readOnlyRootFilesystem: true 41 | # runAsNonRoot: true 42 | # runAsUser: 1000 43 | 44 | resources: {} 45 | # We usually recommend not to specify default resources and to leave this as a conscious 46 | # choice for the user. This also increases chances charts run on environments with little 47 | # resources, such as Minikube. If you do want to specify resources, uncomment the following 48 | # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 49 | # limits: 50 | # cpu: 100m 51 | # memory: 128Mi 52 | # requests: 53 | # cpu: 100m 54 | # memory: 128Mi 55 | 56 | autoscaling: 57 | enabled: false 58 | minReplicas: 1 59 | maxReplicas: 100 60 | targetCPUUtilizationPercentage: 80 61 | # targetMemoryUtilizationPercentage: 80 62 | 63 | # Additional volumes on the output Deployment definition. 64 | volumes: [] 65 | # - name: foo 66 | # secret: 67 | # secretName: mysecret 68 | # optional: false 69 | 70 | # Additional volumeMounts on the output Deployment definition. 71 | volumeMounts: [] 72 | # - name: foo 73 | # mountPath: "/etc/foo" 74 | # readOnly: true 75 | 76 | nodeSelector: {} 77 | tolerations: [] 78 | affinity: {} 79 | 80 | controller: 81 | image: 82 | registry: '' 83 | repository: baizeai/kcover-controller 84 | pullPolicy: IfNotPresent 85 | # Overrides the image tag whose default is the chart appVersion. 
86 | tag: "latest" 87 | 88 | imagePullSecrets: [] 89 | 90 | replicas: 1 91 | podAnnotations: {} 92 | podLabels: {} 93 | podSecurityContext: {} 94 | # fsGroup: 2000 95 | securityContext: {} 96 | # capabilities: 97 | # drop: 98 | # - ALL 99 | # readOnlyRootFilesystem: true 100 | # runAsNonRoot: true 101 | # runAsUser: 1000 102 | 103 | resources: {} 104 | # We usually recommend not to specify default resources and to leave this as a conscious 105 | # choice for the user. This also increases chances charts run on environments with little 106 | # resources, such as Minikube. If you do want to specify resources, uncomment the following 107 | # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 108 | # limits: 109 | # cpu: 100m 110 | # memory: 128Mi 111 | # requests: 112 | # cpu: 100m 113 | # memory: 128Mi 114 | 115 | autoscaling: 116 | enabled: false 117 | minReplicas: 1 118 | maxReplicas: 100 119 | targetCPUUtilizationPercentage: 80 120 | # targetMemoryUtilizationPercentage: 80 121 | 122 | # Additional volumes on the output Deployment definition. 123 | volumes: [] 124 | # - name: foo 125 | # secret: 126 | # secretName: mysecret 127 | # optional: false 128 | 129 | # Additional volumeMounts on the output Deployment definition. 
130 | volumeMounts: [] 131 | # - name: foo 132 | # mountPath: "/etc/foo" 133 | # readOnly: true 134 | 135 | nodeSelector: {} 136 | tolerations: [] 137 | affinity: {} 138 | -------------------------------------------------------------------------------- /pkg/constants/const.go: -------------------------------------------------------------------------------- 1 | package constants 2 | 3 | const ( 4 | KubeflowJobLabel = "training.kubeflow.org/job-name" 5 | 6 | // recovery annotations 7 | NeedRecoveryAnnotation = "kcover.io/need-recovery" 8 | 9 | EnabledRecoveryLabel = "kcover.io/cascading-recovery" 10 | 11 | True = "true" 12 | ) 13 | -------------------------------------------------------------------------------- /pkg/diagnosis/collector.go: -------------------------------------------------------------------------------- 1 | package diagnosis 2 | 3 | import ( 4 | "github.com/baizeai/kcover/pkg/events" 5 | "github.com/baizeai/kcover/pkg/runner" 6 | ) 7 | 8 | type Diagnostic interface { 9 | runner.Runner 10 | Events() <-chan events.CollectorEvent 11 | } 12 | -------------------------------------------------------------------------------- /pkg/diagnosis/controller/controller.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/baizeai/kcover/pkg/diagnosis" 7 | "github.com/baizeai/kcover/pkg/diagnosis/podstatus" 8 | "github.com/baizeai/kcover/pkg/events" 9 | "github.com/baizeai/kcover/pkg/runner" 10 | "k8s.io/client-go/kubernetes" 11 | "k8s.io/klog/v2" 12 | ) 13 | 14 | var _ runner.Runner = (*controllerDiagnostic)(nil) 15 | 16 | type controllerDiagnostic struct { 17 | diagnostics []diagnosis.Diagnostic 18 | recorder events.Recorder 19 | } 20 | 21 | func NewControllerDiagnostic(cli kubernetes.Interface, recorder events.Recorder) (runner.Runner, error) { 22 | diags := make([]diagnosis.Diagnostic, 0) 23 | 24 | diagPodCollector, err := podstatus.NewPodStatusCollector(cli) 25 | 
if err != nil { 26 | return nil, fmt.Errorf("failed to create pod status collector: %v", err) 27 | } 28 | 29 | diags = append(diags, diagPodCollector) 30 | 31 | if recorder == nil { 32 | return nil, fmt.Errorf("recorder can not be nil") 33 | } 34 | 35 | return &controllerDiagnostic{ 36 | diagnostics: diags, 37 | recorder: recorder, 38 | }, nil 39 | } 40 | 41 | func (c *controllerDiagnostic) Start() error { 42 | for _, d := range c.diagnostics { 43 | if err := d.Start(); err != nil { 44 | return err 45 | } 46 | } 47 | for _, d := range c.diagnostics { 48 | go func(d diagnosis.Diagnostic) { 49 | for e := range d.Events() { 50 | err := c.recorder.RecordEvent(e) 51 | if err != nil { 52 | klog.Errorf("failed to record event of %T: %v", d, err) 53 | } 54 | } 55 | }(d) 56 | } 57 | 58 | return nil 59 | } 60 | 61 | func (c *controllerDiagnostic) Stop() { 62 | for _, d := range c.diagnostics { 63 | d.Stop() 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /pkg/diagnosis/nvidiadiag/nvidia_diag.go: -------------------------------------------------------------------------------- 1 | package nvidiadiag 2 | 3 | import ( 4 | "time" 5 | 6 | "github.com/baizeai/kcover/pkg/diagnosis" 7 | "github.com/baizeai/kcover/pkg/events" 8 | "github.com/baizeai/kcover/pkg/runner" 9 | "k8s.io/klog/v2" 10 | ) 11 | 12 | var _ runner.Runner = (*dcgmDiag)(nil) 13 | var _ diagnosis.Diagnostic = (*dcgmDiag)(nil) 14 | 15 | type dcgmDiag struct { 16 | nodeName string 17 | events chan events.CollectorEvent 18 | stop chan struct{} 19 | } 20 | 21 | func NewDCGMDiagnosis(nodeName string) (diagnosis.Diagnostic, error) { 22 | return &dcgmDiag{ 23 | events: make(chan events.CollectorEvent), 24 | stop: make(chan struct{}), 25 | nodeName: nodeName, 26 | }, nil 27 | } 28 | 29 | func (d *dcgmDiag) Start() error { 30 | go func() { 31 | t := time.NewTicker(time.Second * 30) 32 | defer t.Stop() 33 | for { 34 | select { 35 | case <-t.C: 36 | // run dcgmi 37 | // parse 
results 38 | klog.Infof("start dcgmi diag -r 1") 39 | //d.events <- events.CollectorEvent{ 40 | // TargetType: events.Node, 41 | // Name: "worker-a800-2", 42 | // EventType: events.Error, 43 | // Message: "test event for worker-a800-2", 44 | //} 45 | case <-d.stop: 46 | return 47 | } 48 | } 49 | }() 50 | return nil 51 | } 52 | 53 | func (d *dcgmDiag) Stop() { 54 | close(d.stop) 55 | } 56 | 57 | func (d *dcgmDiag) Events() <-chan events.CollectorEvent { 58 | return d.events 59 | } 60 | -------------------------------------------------------------------------------- /pkg/diagnosis/podstatus/pod_status.go: -------------------------------------------------------------------------------- 1 | package podstatus 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "time" 7 | 8 | "github.com/baizeai/kcover/pkg/constants" 9 | "github.com/baizeai/kcover/pkg/diagnosis" 10 | "github.com/baizeai/kcover/pkg/events" 11 | "github.com/baizeai/kcover/pkg/runner" 12 | corev1 "k8s.io/api/core/v1" 13 | "k8s.io/client-go/informers" 14 | "k8s.io/client-go/kubernetes" 15 | "k8s.io/client-go/tools/cache" 16 | ) 17 | 18 | var _ runner.Runner = (*podStatusCollector)(nil) 19 | var _ diagnosis.Diagnostic = (*podStatusCollector)(nil) 20 | 21 | type podStatusCollector struct { 22 | client kubernetes.Interface 23 | eventsChan chan events.CollectorEvent 24 | stop chan struct{} 25 | } 26 | 27 | func NewPodStatusCollector(cli kubernetes.Interface) (diagnosis.Diagnostic, error) { 28 | return &podStatusCollector{ 29 | client: cli, 30 | eventsChan: make(chan events.CollectorEvent), 31 | stop: make(chan struct{}), 32 | }, nil 33 | } 34 | 35 | func (p *podStatusCollector) onPodUpdate(oldPod, newPod *corev1.Pod) { 36 | if oldPod != nil { 37 | if reflect.DeepEqual(oldPod.Status.ContainerStatuses, newPod.Status.ContainerStatuses) { 38 | // no need to check 39 | return 40 | } 41 | } 42 | for _, cs := range newPod.Status.ContainerStatuses { 43 | if cs.State.Terminated != nil { 44 | if cs.State.Terminated.Reason == 
"Error" { 45 | p.eventsChan <- events.CollectorEvent{ 46 | TargetType: events.Pod, 47 | Namespace: newPod.Namespace, 48 | Name: newPod.Name, 49 | EventType: events.Error, 50 | Message: fmt.Sprintf("container %s terminated with error: %s, exit code: %d", cs.Name, cs.State.Terminated.Message, cs.State.Terminated.ExitCode), 51 | } 52 | } 53 | } 54 | } 55 | } 56 | 57 | func (p *podStatusCollector) Start() error { 58 | factory := informers.NewSharedInformerFactory(p.client, time.Minute) 59 | informer := factory.Core().V1().Pods().Informer() 60 | _, err := informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ 61 | AddFunc: func(obj interface{}) { 62 | newPod := obj.(*corev1.Pod) 63 | if newPod.Labels[constants.EnabledRecoveryLabel] == "" { 64 | return 65 | } 66 | 67 | p.onPodUpdate(nil, newPod) 68 | }, 69 | UpdateFunc: func(oldObj, newObj interface{}) { 70 | newPod := newObj.(*corev1.Pod) 71 | oldPod := oldObj.(*corev1.Pod) 72 | if newPod.ResourceVersion == oldPod.ResourceVersion { 73 | return 74 | } 75 | if newPod.Labels[constants.EnabledRecoveryLabel] == "" { 76 | return 77 | } 78 | 79 | p.onPodUpdate(oldPod, newPod) 80 | }, 81 | }) 82 | if err != nil { 83 | return err 84 | } 85 | 86 | go informer.Run(p.stop) 87 | return nil 88 | } 89 | 90 | func (p *podStatusCollector) Stop() { 91 | close(p.stop) 92 | close(p.eventsChan) 93 | } 94 | 95 | func (p *podStatusCollector) Events() <-chan events.CollectorEvent { 96 | return p.eventsChan 97 | } 98 | -------------------------------------------------------------------------------- /pkg/events/events.go: -------------------------------------------------------------------------------- 1 | package events 2 | 3 | import "github.com/baizeai/kcover/pkg/runner" 4 | 5 | type TargetType string 6 | 7 | const ( 8 | Pod TargetType = "pod" 9 | Node TargetType = "node" 10 | Device TargetType = "device" 11 | ) 12 | 13 | type EventType int 14 | 15 | const ( 16 | _ EventType = iota 17 | Error 18 | Warning 19 | ) 20 | 21 | type 
CollectorEvent struct { 22 | TargetType 23 | Namespace string 24 | Name string 25 | EventType 26 | Message string 27 | } 28 | 29 | type Recorder interface { 30 | runner.Runner 31 | RecordEvent(e CollectorEvent) error 32 | EventChan() <-chan CollectorEvent 33 | } 34 | -------------------------------------------------------------------------------- /pkg/events/kubeevents.go: -------------------------------------------------------------------------------- 1 | package events 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "time" 7 | 8 | "k8s.io/klog/v2" 9 | 10 | "k8s.io/apimachinery/pkg/runtime" 11 | "k8s.io/apimachinery/pkg/runtime/schema" 12 | "k8s.io/client-go/kubernetes/scheme" 13 | v1 "k8s.io/client-go/kubernetes/typed/core/v1" 14 | "k8s.io/client-go/tools/record" 15 | "k8s.io/client-go/tools/reference" 16 | 17 | "github.com/baizeai/kcover/pkg/constants" 18 | corev1 "k8s.io/api/core/v1" 19 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 20 | "k8s.io/client-go/informers" 21 | "k8s.io/client-go/kubernetes" 22 | "k8s.io/client-go/tools/cache" 23 | ) 24 | 25 | type kubeEventsRecorder struct { 26 | client kubernetes.Interface 27 | eventChan chan CollectorEvent 28 | stop chan struct{} 29 | watchEvent bool 30 | recorder record.EventRecorder 31 | } 32 | 33 | func NewKubeEventsRecorder(cli kubernetes.Interface, watchEvent bool) Recorder { 34 | eventBroadcaster := record.NewBroadcaster() 35 | eventBroadcaster.StartRecordingToSink(&v1.EventSinkImpl{ 36 | Interface: cli.CoreV1().Events(""), 37 | }) 38 | recorder := eventBroadcaster.NewRecorder(runtime.NewScheme(), corev1.EventSource{Component: "kcover"}) 39 | return &kubeEventsRecorder{ 40 | client: cli, 41 | eventChan: make(chan CollectorEvent), 42 | stop: make(chan struct{}), 43 | watchEvent: watchEvent, 44 | recorder: recorder, 45 | } 46 | } 47 | 48 | func (a *kubeEventsRecorder) Start() error { 49 | if !a.watchEvent { 50 | return nil 51 | } 52 | 53 | factory := informers.NewSharedInformerFactory(a.client, time.Minute) 54 | 
informer := factory.Core().V1().Events().Informer() 55 | 56 | _, err := informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ 57 | AddFunc: func(obj interface{}) { 58 | event := obj.(*corev1.Event) 59 | eventTimestamp := event.LastTimestamp 60 | if eventTimestamp.IsZero() { 61 | eventTimestamp = event.CreationTimestamp 62 | } 63 | if eventTimestamp.Add(3 * time.Minute).Before(time.Now()) { 64 | klog.Infof("event %s is too old %s against %s, ignore it", event.Name, eventTimestamp.String(), time.Now().String()) 65 | return 66 | } 67 | if event.Annotations[constants.NeedRecoveryAnnotation] == "true" { 68 | obj := event.InvolvedObject 69 | switch obj.GroupVersionKind() { 70 | case schema.GroupVersionKind{ 71 | Group: "", 72 | Version: "v1", 73 | Kind: "Pod", 74 | }: 75 | a.eventChan <- CollectorEvent{ 76 | TargetType: Pod, 77 | Namespace: obj.Namespace, 78 | Name: obj.Name, 79 | EventType: Error, // todo change me 80 | Message: event.Message, 81 | } 82 | case schema.GroupVersionKind{ 83 | Group: "", 84 | Version: "v1", 85 | Kind: "Node", 86 | }: 87 | a.eventChan <- CollectorEvent{ 88 | TargetType: Node, 89 | Name: obj.Name, 90 | EventType: Error, // todo change me 91 | Message: event.Message, 92 | } 93 | } 94 | return 95 | } 96 | }, 97 | }) 98 | 99 | if err != nil { 100 | return err 101 | } 102 | go informer.Run(a.stop) 103 | 104 | return nil 105 | } 106 | 107 | func (a *kubeEventsRecorder) Stop() { 108 | close(a.stop) 109 | } 110 | 111 | func (a *kubeEventsRecorder) recordToPod(e CollectorEvent) error { 112 | pod, err := a.client.CoreV1().Pods(e.Namespace).Get(context.Background(), e.Name, metav1.GetOptions{}) 113 | if err != nil { 114 | return err 115 | } 116 | ref, err := reference.GetReference(scheme.Scheme, pod) 117 | if err != nil { 118 | return err 119 | } 120 | 121 | // 记录事件 122 | a.recorder.AnnotatedEventf(ref, map[string]string{ 123 | constants.NeedRecoveryAnnotation: "true", 124 | }, corev1.EventTypeWarning, "Error", e.Message) 125 | 126 | return nil 127 
| } 128 | 129 | func (a *kubeEventsRecorder) recordToNode(e CollectorEvent) error { 130 | // patch pod with annotation 131 | node, err := a.client.CoreV1().Nodes().Get(context.Background(), e.Name, metav1.GetOptions{}) 132 | if err != nil { 133 | return err 134 | } 135 | ref, err := reference.GetReference(scheme.Scheme, node) 136 | if err != nil { 137 | return err 138 | } 139 | 140 | // 记录事件 141 | a.recorder.AnnotatedEventf(ref, map[string]string{ 142 | constants.NeedRecoveryAnnotation: "true", 143 | }, corev1.EventTypeWarning, "Error", e.Message) 144 | 145 | return nil 146 | } 147 | 148 | func (a *kubeEventsRecorder) RecordEvent(e CollectorEvent) error { 149 | var err error 150 | switch e.TargetType { 151 | case Pod: 152 | err = a.recordToPod(e) 153 | case Node: 154 | err = a.recordToNode(e) 155 | default: 156 | //TODO implement me 157 | return fmt.Errorf("unsupported target type: %s", e.TargetType) 158 | } 159 | return err 160 | } 161 | 162 | func (a *kubeEventsRecorder) EventChan() <-chan CollectorEvent { 163 | return a.eventChan 164 | } 165 | -------------------------------------------------------------------------------- /pkg/kube/kube.go: -------------------------------------------------------------------------------- 1 | package kube 2 | 3 | import ( 4 | "os" 5 | 6 | "k8s.io/client-go/rest" 7 | "k8s.io/client-go/tools/clientcmd" 8 | ) 9 | 10 | func GetK8sConfigConfigWithFile(kubeconfig, context string) *rest.Config { 11 | var config *rest.Config 12 | if kubeconfig == "" && context == "" { 13 | config, _ := rest.InClusterConfig() 14 | if config != nil { 15 | return config 16 | } 17 | } 18 | if kubeconfig != "" { 19 | info, err := os.Stat(kubeconfig) 20 | if err != nil || info.Size() == 0 { 21 | // If the specified kubeconfig doesn't exists / empty file / any other error 22 | // from file stat, fall back to default 23 | kubeconfig = "" 24 | } 25 | } 26 | 27 | // Config loading rules: 28 | // 1. kubeconfig if it not empty string 29 | // 2. 
In cluster config if running in-cluster 30 | // 3. Config(s) in KUBECONFIG environment variable 31 | // 4. Use $HOME/.kube/config 32 | loadingRules := clientcmd.NewDefaultClientConfigLoadingRules() 33 | loadingRules.DefaultClientConfig = &clientcmd.DefaultClientConfig 34 | loadingRules.ExplicitPath = kubeconfig 35 | configOverrides := &clientcmd.ConfigOverrides{ 36 | ClusterDefaults: clientcmd.ClusterDefaults, 37 | CurrentContext: context, 38 | } 39 | 40 | config, _ = clientcmd.NewNonInteractiveDeferredLoadingClientConfig(loadingRules, configOverrides).ClientConfig() 41 | return config 42 | } 43 | -------------------------------------------------------------------------------- /pkg/recovery/job.go: -------------------------------------------------------------------------------- 1 | package recovery 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "time" 7 | 8 | "github.com/jellydator/ttlcache/v3" 9 | corev1 "k8s.io/api/core/v1" 10 | "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" 11 | "k8s.io/client-go/kubernetes" 12 | ) 13 | 14 | var ttlCache = ttlcache.New[string, map[string]string]() 15 | 16 | func getPodRelatedJobLabels(cli kubernetes.Interface, pod *corev1.Pod) (map[string]string, error) { 17 | if len(pod.OwnerReferences) < 1 { 18 | return nil, fmt.Errorf("pod %s/%s has no owner", pod.Namespace, pod.Name) 19 | } 20 | 21 | owner := pod.OwnerReferences[0] 22 | if v := ttlCache.Get(string(owner.UID)); v != nil { 23 | return v.Value(), nil 24 | } 25 | 26 | var resource string 27 | switch owner.Kind { 28 | case "PyTorchJob": 29 | resource = "pytorchjobs" 30 | case "TFJob": 31 | resource = "tfjobs" 32 | } 33 | 34 | un := unstructured.Unstructured{} 35 | err := cli.Discovery().RESTClient().Get(). 36 | AbsPath(fmt.Sprintf("/apis/%s/namespaces/%s/%s/%s", owner.APIVersion, pod.Namespace, resource, owner.Name)). 
37 | Do(context.Background()).Into(&un) 38 | if err != nil { 39 | return nil, err 40 | } 41 | 42 | ls := un.GetLabels() 43 | ttlCache.Set(string(owner.UID), ls, time.Second*30) 44 | 45 | return ls, nil 46 | } 47 | -------------------------------------------------------------------------------- /pkg/recovery/recovery.go: -------------------------------------------------------------------------------- 1 | package recovery 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "time" 7 | 8 | "github.com/samber/lo" 9 | 10 | "github.com/baizeai/kcover/pkg/constants" 11 | "github.com/baizeai/kcover/pkg/events" 12 | "github.com/jellydator/ttlcache/v3" 13 | corev1 "k8s.io/api/core/v1" 14 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 15 | "k8s.io/client-go/kubernetes" 16 | "k8s.io/klog/v2" 17 | ) 18 | 19 | type RecoveryController struct { 20 | client kubernetes.Interface 21 | recorder events.Recorder 22 | stop chan struct{} 23 | restartDuration time.Duration 24 | restarts *ttlcache.Cache[string, time.Time] 25 | } 26 | 27 | func NewRecoveryController(cli kubernetes.Interface, recorder events.Recorder) *RecoveryController { 28 | return &RecoveryController{ 29 | client: cli, 30 | recorder: recorder, 31 | stop: make(chan struct{}), 32 | restartDuration: time.Second * 30, 33 | restarts: ttlcache.New[string, time.Time](), 34 | } 35 | } 36 | 37 | func (r *RecoveryController) onPodError(namespace, name string) { 38 | pod, err := r.client.CoreV1().Pods(namespace).Get(context.Background(), name, metav1.GetOptions{}) 39 | if err != nil { 40 | klog.Errorf("get pod %s/%s error events error: %v", namespace, name, err) 41 | return 42 | } 43 | if pod.Labels[constants.EnabledRecoveryLabel] != constants.True { 44 | ls, err := getPodRelatedJobLabels(r.client, pod) 45 | if err != nil { 46 | klog.Errorf("get pod %s/%s related job labels error: %v", namespace, name, err) 47 | return 48 | } 49 | if ls[constants.EnabledRecoveryLabel] != constants.True { 50 | klog.Infof("pod %s/%s or its owner job has no 
recovery label", namespace, name) 51 | return 52 | } 53 | } 54 | if jobLabel, ok := pod.Labels[constants.KubeflowJobLabel]; !ok { 55 | klog.Warningf("pod %s/%s has no job label", namespace, name) 56 | return 57 | } else { 58 | if pod.Spec.RestartPolicy == corev1.RestartPolicyNever { 59 | klog.Warningf("pod %s/%s has RestartPolicyNever, will not restart", namespace, name) 60 | return 61 | } 62 | key := fmt.Sprintf("%s/%s", namespace, jobLabel) 63 | tv := r.restarts.Get(key) 64 | if tv != nil { 65 | klog.Infof("job %s/%s has been restarted at %v, will not restart again in %v", namespace, jobLabel, tv.Value(), r.restartDuration) 66 | return 67 | } 68 | now := time.Now() 69 | r.restarts.Set(key, now, r.restartDuration) // only restart once in 60 seconds 70 | r.restartJob(context.Background(), namespace, jobLabel) 71 | go func() { 72 | <-time.After(r.restartDuration - time.Second) 73 | r.restarts.Delete(key) // 74 | }() 75 | } 76 | } 77 | 78 | func (r *RecoveryController) restartJob(ctx context.Context, namespace, name string) { 79 | err := r.client.CoreV1().Pods(namespace).DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{ 80 | LabelSelector: fmt.Sprintf("%s=%s", constants.KubeflowJobLabel, name), 81 | }) 82 | if err != nil { 83 | klog.Errorf("restart job %s/%s error: %v", namespace, name, err) 84 | } else { 85 | klog.Infof("restart job %s/%s successfully", namespace, name) 86 | } 87 | } 88 | 89 | type nsName struct { 90 | ns string 91 | name string 92 | } 93 | 94 | func (r *RecoveryController) onNodeError(name string) { 95 | node, err := r.client.CoreV1().Nodes().Get(context.Background(), name, metav1.GetOptions{}) 96 | if err != nil { 97 | klog.Errorf("get node %s error: %v", name, err) 98 | return 99 | } 100 | if node.Spec.Unschedulable { 101 | klog.Infof("the node %s status has been set to unschedulable", name) 102 | return 103 | } 104 | // query jobs 105 | pods, err := r.client.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{ 106 | 
LabelSelector: constants.KubeflowJobLabel, 107 | FieldSelector: fmt.Sprintf("spec.nodeName=%s", name), 108 | }) 109 | if err != nil { 110 | klog.Errorf("fetch pods list for node %s error: %v", node, err) 111 | return 112 | } 113 | jobs := map[nsName]struct{}{} 114 | lo.ForEach(pods.Items, func(pod corev1.Pod, index int) { 115 | if jobLabel, ok := pod.Labels[constants.KubeflowJobLabel]; !ok { 116 | return 117 | } else { 118 | jobs[nsName{ 119 | ns: pod.Namespace, 120 | name: jobLabel, 121 | }] = struct{}{} 122 | } 123 | }) 124 | lo.ForEach(lo.Keys(jobs), func(item nsName, index int) { 125 | r.onPodError(item.ns, item.name) 126 | }) 127 | node.Spec.Unschedulable = true 128 | _, err = r.client.CoreV1().Nodes().Update(context.Background(), node, metav1.UpdateOptions{}) 129 | if err != nil { 130 | klog.Errorf("update node %s to unschedulable error: %v", name, err) 131 | } 132 | } 133 | 134 | func (r *RecoveryController) onEvent(e events.CollectorEvent) { 135 | klog.Infof("recover controller received event: %+v", e) 136 | switch e.TargetType { 137 | case events.Pod: 138 | if e.EventType == events.Error { 139 | r.onPodError(e.Namespace, e.Name) 140 | } 141 | case events.Node: 142 | r.onNodeError(e.Name) 143 | default: 144 | klog.Errorf("unsupported target type: %s", e.TargetType) 145 | } 146 | } 147 | 148 | func (r *RecoveryController) Start() error { 149 | if r.recorder == nil { 150 | return fmt.Errorf("recorder is nil") 151 | } 152 | go func() { 153 | for e := range r.recorder.EventChan() { 154 | r.onEvent(e) 155 | } 156 | }() 157 | return nil 158 | } 159 | 160 | func (r *RecoveryController) Stop() { 161 | close(r.stop) 162 | } 163 | -------------------------------------------------------------------------------- /pkg/runner/runner.go: -------------------------------------------------------------------------------- 1 | package runner 2 | 3 | type Runner interface { 4 | Start() error 5 | Stop() 6 | } 7 | 
--------------------------------------------------------------------------------