├── pkg ├── yarn │ ├── apis │ │ ├── VERSIONS │ │ ├── proto │ │ │ ├── hadoopcommon │ │ │ │ ├── RefreshCallQueueProtocol.proto │ │ │ │ ├── GetUserMappingsProtocol.proto │ │ │ │ ├── RefreshAuthorizationPolicyProtocol.proto │ │ │ │ ├── ZKFCProtocol.proto │ │ │ │ ├── IpcConnectionContext.proto │ │ │ │ ├── GenericRefreshProtocol.proto │ │ │ │ ├── RefreshUserMappingsProtocol.proto │ │ │ │ ├── TraceAdmin.proto │ │ │ │ ├── Security.proto │ │ │ │ ├── ProtocolInfo.proto │ │ │ │ ├── ProtobufRpcEngine.proto │ │ │ │ ├── FSProtos.proto │ │ │ │ ├── HAServiceProtocol.proto │ │ │ │ └── RpcHeader.proto │ │ │ └── hadoopyarn │ │ │ │ ├── server │ │ │ │ ├── resourcemanager_administration_protocol.proto │ │ │ │ └── yarn_server_resourcemanager_service_protos.proto │ │ │ │ └── applicationclient_protocol.proto │ │ ├── service │ │ │ ├── ha_service.go │ │ │ ├── applicationclient_service.go │ │ │ └── resourcemanager_administration_service.go │ │ ├── security │ │ │ ├── ugi.go │ │ │ └── digestmd5.go │ │ └── auth │ │ │ └── auth.go │ ├── cache │ │ ├── yarn_node.go │ │ └── nodes_syncer.go │ ├── client │ │ ├── examples │ │ │ ├── rm-get-cluster-nodes-with-ha │ │ │ │ └── main.go │ │ │ ├── ha-get-service-status │ │ │ │ └── main.go │ │ │ ├── rm-update-node-resource-with-ha │ │ │ │ └── main.go │ │ │ └── rm-update-node-resource │ │ │ │ └── main.go │ │ ├── ha_service_client.go │ │ ├── application_client.go │ │ ├── rm_admin_client.go │ │ ├── factory.go │ │ ├── mockclient │ │ │ ├── mock_factory.go │ │ │ └── mock_client.go │ │ └── client.go │ └── config │ │ ├── configuration.go │ │ └── yarn_configuration.go ├── copilot-agent │ ├── utils │ │ ├── cgroup_unsupported.go │ │ ├── cgroups_linux.go │ │ └── utils.go │ ├── nm │ │ ├── nm_pod_discover.go │ │ └── types.go │ ├── server │ │ ├── helper.go │ │ └── server.go │ └── runtime │ │ ├── manager.go │ │ └── runtime.go └── controller │ ├── noderesource │ ├── types.go │ ├── yarn_resource_test.go │ └── yarn_resource.go │ └── metrics │ ├── names.go │ └── yarn_collector.go 
├── OWNERS ├── .licenseignore ├── .github ├── ISSUE_TEMPLATE │ ├── proposal.md │ ├── question.md │ ├── bug-report.md │ └── membership.yml ├── dependabot.yml ├── pull_request_template.md ├── workflows │ ├── license.yml │ ├── release.yaml │ └── ci.yaml └── stale.yml ├── docker ├── yarn-operator.dockerfile ├── yarn-copilot-agent.dockerfile └── hadoop-yarn.dockerfile ├── .gitignore ├── apis └── doc.go ├── hack ├── boilerplate │ └── boilerplate.go.txt ├── update-license-header.sh ├── mock-gen.sh └── generate-yarn.sh ├── codecov.yaml ├── SECURITY.md ├── .license └── dependency_decisions.yml ├── cmd ├── yarn-operator │ ├── options │ │ ├── controllers.go │ │ ├── scheme.go │ │ └── options.go │ └── main.go └── yarn-copilot-agent │ ├── options │ └── options.go │ └── main.go ├── config └── manager │ ├── role.yaml │ ├── configmap.yaml │ └── yarn-operator.yaml ├── embargo-policy.md ├── README-zh_CN.md └── README.md /pkg/yarn/apis/VERSIONS: -------------------------------------------------------------------------------- 1 | hadoop=release-3.2.1-RC0 2 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | approvers: 2 | - zwzhang0107 3 | 4 | reviewers: 5 | - zwzhang0107 6 | -------------------------------------------------------------------------------- /.licenseignore: -------------------------------------------------------------------------------- 1 | vendor 2 | pkg/yarn/apis/auth 3 | pkg/yarn/apis/proto 4 | pkg/yarn/apis/security 5 | pkg/yarn/client/ipc 6 | pkg/yarn/config 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/proposal.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Proposal 3 | about: Suggest an idea for this project 4 | title: "[proposal]" 5 | labels: kind/proposal 6 | 7 | --- 8 | 9 | 10 | 11 | **What is your proposal**: 12 | 13 | 
14 | **Why is this needed**: 15 | 16 | 17 | **Is there a suggested solution, if so, please add it**: 18 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "gomod" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | labels: 8 | - "dependencies" 9 | commit-message: 10 | prefix: "feat" 11 | include: "scope" 12 | - package-ecosystem: "github-actions" 13 | directory: "/" 14 | schedule: 15 | interval: "daily" 16 | labels: 17 | - "dependencies" 18 | commit-message: 19 | prefix: "chore" 20 | include: "scope" 21 | -------------------------------------------------------------------------------- /docker/yarn-operator.dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.19 as builder 2 | WORKDIR /go/src/github.com/koordinator-sh/yarn-copilot 3 | 4 | COPY go.mod go.mod 5 | COPY go.sum go.sum 6 | 7 | RUN go mod download 8 | 9 | COPY cmd/ cmd/ 10 | COPY pkg/ pkg/ 11 | 12 | RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -a -o koord-yarn-operator cmd/yarn-operator/main.go 13 | 14 | FROM alpine:3.16 15 | RUN apk add --update bash net-tools iproute2 logrotate less rsync util-linux lvm2 16 | WORKDIR / 17 | COPY --from=builder /go/src/github.com/koordinator-sh/yarn-copilot . 
18 | ENTRYPOINT ["/koord-yarn-operator"] 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | bin 8 | 9 | # Test binary, built with `go test -c` 10 | *.test 11 | 12 | # Output of the go coverage tool, specifically when used with LiteIDE 13 | *.out 14 | 15 | # Dependency directories (remove the comment below to include it) 16 | vendor/ 17 | 18 | # Kubernetes Generated files - skip generated files, except for vendored files 19 | 20 | !vendor/**/zz_generated.* 21 | 22 | # editor and IDE paraphernalia 23 | .idea 24 | *.swp 25 | *.swo 26 | *~ 27 | 28 | .vscode 29 | .DS_Store 30 | 31 | dist/ 32 | -------------------------------------------------------------------------------- /docker/yarn-copilot-agent.dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.17 as builder 2 | WORKDIR /go/src/github.com/koordinator-sh/yarn-copilot 3 | 4 | COPY go.mod go.mod 5 | COPY go.sum go.sum 6 | 7 | RUN go mod download 8 | 9 | COPY cmd/ cmd/ 10 | COPY pkg/ pkg/ 11 | 12 | RUN GOOS=linux GOARCH=amd64 go build -a -o koord-yarn-copilot cmd/yarn-copilot-agent/main.go 13 | # Runtime stage is Ubuntu-based, so packages are installed with apt-get (apt-get has no `add` operation; that is apk syntax). 14 | FROM nvidia/cuda:11.2.2-base-ubuntu20.04 15 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y bash net-tools iproute2 logrotate less rsync util-linux lvm2 16 | WORKDIR / 17 | COPY --from=builder /go/src/github.com/koordinator-sh/yarn-copilot . 18 | ENTRYPOINT ["/koord-yarn-copilot"] 19 | -------------------------------------------------------------------------------- /apis/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package apis 18 | -------------------------------------------------------------------------------- /hack/boilerplate/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | -------------------------------------------------------------------------------- /pkg/yarn/cache/yarn_node.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package cache 18 | 19 | type YarnNode struct { 20 | Name string 21 | Port int32 22 | ClusterID string 23 | } 24 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ### Ⅰ. Describe what this PR does 2 | 3 | 9 | 10 | ### Ⅱ. Does this pull request fix one issue? 11 | 12 | 13 | 14 | ### Ⅲ. Describe how to verify it 15 | 16 | ### Ⅳ. Special notes for reviews 17 | 18 | ### V. Checklist 19 | 20 | - [ ] I have written necessary docs and comments 21 | - [ ] I have added necessary unit tests and integration tests 22 | - [ ] All checks passed in `make test` 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Support request or question relating to Koordinator 4 | title: "[question]" 5 | labels: kind/question 6 | 7 | --- 8 | 9 | 10 | 11 | **What happened:** 12 | 13 | **What you expected to happen:** 14 | 15 | **Environment:** 16 | 17 | 18 | 19 | - Koordinator version: - v0.6.2 20 | - Kubernetes version (use kubectl version): v1.22.5 21 | - docker/containerd version: containerd 1.5.0 22 | - OS (e.g: cat /etc/os-release): Ubuntu 20.04.4 LTS 23 | - Kernel (e.g. 
uname -a): Linux 5.10.112-11.al8.x86_64 #1 SMP Tue May 24 16:05:50 CST 2022 x86_64 x86_64 x86_64 GNU/Linux 24 | 25 | **Anything else we need to know:** 26 | -------------------------------------------------------------------------------- /codecov.yaml: -------------------------------------------------------------------------------- 1 | # https://docs.codecov.com/docs/commit-status 2 | coverage: 3 | status: 4 | project: 5 | # global coverage 6 | default: 7 | target: auto 8 | threshold: 2% 9 | flags: 10 | - unittests 11 | paths: 12 | - "pkg" 13 | if_ci_failed: error 14 | if_no_uploads: error 15 | if_not_found: success 16 | patch: 17 | # diff coverage 18 | default: 19 | target: 70% 20 | flags: 21 | - unittests 22 | paths: 23 | - "pkg" 24 | informational: true 25 | if_ci_failed: ignore 26 | if_no_uploads: success 27 | if_not_found: success 28 | 29 | ignore: 30 | - "apis" 31 | - "yarn/client" 32 | - "yarn/apis" 33 | - "**/*_generated.*" 34 | 35 | github_checks: 36 | annotations: true 37 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | - [Security Policy](#security-policy) 4 | - [Reporting security problems](#reporting-security-problems) 5 | - [Vulnerability Management Plans](#vulnerability-management-plans) 6 | - [Critical Updates And Security Notices](#critical-updates-and-security-notices) 7 | 8 | ## Reporting security problems 9 | 10 | **DO NOT CREATE AN ISSUE** to report a security problem. Instead, please 11 | send an email to kubernetes-security@service.aliyun.com 12 | 13 | Please follow the [embargo policy](./embargo-policy.md) for all security-related problems. 14 | 15 | ## Vulnerability Management Plans 16 | 17 | ### Critical Updates And Security Notices 18 | 19 | We learn about critical software updates and security threats from these sources 20 | 21 | 1. GitHub Security Alerts 22 | 2. 
[Dependabot](https://dependabot.com/) Dependency Updates 23 | -------------------------------------------------------------------------------- /pkg/copilot-agent/utils/cgroup_unsupported.go: -------------------------------------------------------------------------------- 1 | //go:build !linux 2 | // +build !linux 3 | 4 | /* 5 | Copyright 2022 The Koordinator Authors. 6 | 7 | Licensed under the Apache License, Version 2.0 (the "License"); 8 | you may not use this file except in compliance with the License. 9 | You may obtain a copy of the License at 10 | 11 | http://www.apache.org/licenses/LICENSE-2.0 12 | 13 | Unless required by applicable law or agreed to in writing, software 14 | distributed under the License is distributed on an "AS IS" BASIS, 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | See the License for the specific language governing permissions and 17 | limitations under the License. 18 | */ 19 | 20 | package utils 21 | 22 | import "fmt" 23 | 24 | func GetPids(cgroupPath string) ([]int, error) { 25 | return nil, fmt.Errorf("unsupported") 26 | } 27 | -------------------------------------------------------------------------------- /pkg/controller/noderesource/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package noderesource 18 | 19 | const ( 20 | YarnNMComponentLabel = "app.kubernetes.io/component" 21 | YarnNMComponentValue = "node-manager" 22 | YarnNodeIdAnnotation = "yarn.hadoop.apache.org/node-id" 23 | ) 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: kind/bug 6 | 7 | --- 8 | 9 | 10 | 11 | **What happened**: 12 | 13 | **What you expected to happen**: 14 | 15 | **How to reproduce it (as minimally and precisely as possible)**: 16 | 17 | **Anything else we need to know?**: 18 | 19 | **Environment**: 20 | - App version: 21 | - Kubernetes version (use `kubectl version`): 22 | - Install details (e.g. helm install args): 23 | - Node environment (for koordlet/runtime-proxy issue): 24 | - Containerd/Docker version: 25 | - OS version: 26 | - Kernel version: 27 | - Cgroup driver: cgroupfs/systemd 28 | - Others: 29 | 30 | -------------------------------------------------------------------------------- /pkg/copilot-agent/utils/cgroups_linux.go: -------------------------------------------------------------------------------- 1 | //go:build linux 2 | // +build linux 3 | 4 | /* 5 | Copyright 2022 The Koordinator Authors. 6 | 7 | Licensed under the Apache License, Version 2.0 (the "License"); 8 | you may not use this file except in compliance with the License. 9 | You may obtain a copy of the License at 10 | 11 | http://www.apache.org/licenses/LICENSE-2.0 12 | 13 | Unless required by applicable law or agreed to in writing, software 14 | distributed under the License is distributed on an "AS IS" BASIS, 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | See the License for the specific language governing permissions and 17 | limitations under the License. 
18 | */ 19 | 20 | package utils 21 | 22 | import "github.com/opencontainers/runc/libcontainer/cgroups" 23 | 24 | func GetPids(cgroupPath string) ([]int, error) { 25 | return cgroups.GetPids(cgroupPath) 26 | } 27 | -------------------------------------------------------------------------------- /.github/workflows/license.yml: -------------------------------------------------------------------------------- 1 | name: License 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - release-* 8 | pull_request: {} 9 | workflow_dispatch: {} 10 | 11 | jobs: 12 | license-check: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: ruby/setup-ruby@v1 17 | with: 18 | ruby-version: 2.6 19 | - name: Check license 20 | run: | 21 | gem install license_finder 22 | license_finder --decisions_file .license/dependency_decisions.yml 23 | - uses: actions/setup-go@v4 24 | with: 25 | cache: false 26 | go-version-file: go.mod 27 | - name: Check license header 28 | run: | 29 | make lint-license && git add apis pkg cmd || exit 1 30 | git diff --cached --exit-code || (echo 'Please run "make lint-license" to verify license header' && exit 1); 31 | -------------------------------------------------------------------------------- /pkg/controller/metrics/names.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package metrics 18 | 19 | const ( 20 | yarnNodeCPUResource = "yarn_node_cpu_resource" 21 | yarnNodeMemoryResource = "yarn_node_memory_resource" 22 | yarnNodeCPUAllocatedResource = "yarn_node_cpu_allocated_resource" 23 | yarnNodeMemoryAllocatedResource = "yarn_node_memory_allocated_resource" 24 | ) 25 | -------------------------------------------------------------------------------- /.license/dependency_decisions.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - - :permit 3 | - MIT 4 | - :who: 5 | :why: 6 | :versions: [] 7 | :when: 2021-03-12 07:35:34.645031000 Z 8 | - - :permit 9 | - Apache 2.0 10 | - :who: 11 | :why: 12 | :versions: [] 13 | :when: 2021-03-12 07:19:18.243194000 Z 14 | - - :permit 15 | - New BSD 16 | - :who: 17 | :why: 18 | :versions: [] 19 | :when: 2021-03-12 07:19:28.540675000 Z 20 | - - :permit 21 | - Simplified BSD 22 | - :who: 23 | :why: 24 | :versions: [] 25 | :when: 2021-03-12 07:20:01.774212000 Z 26 | - - :permit 27 | - Mozilla Public License 2.0 28 | - :who: 29 | :why: 30 | :versions: [] 31 | :when: 2021-03-12 07:21:05.194536000 Z 32 | - - :permit 33 | - unknown 34 | - :who: 35 | :why: 36 | :versions: [] 37 | :when: 2021-03-12 07:21:43.379269000 Z 38 | - - :permit 39 | - ISC 40 | - :who: 41 | :why: 42 | :versions: [] 43 | :when: 2021-03-12 07:22:07.265966000 Z 44 | -------------------------------------------------------------------------------- /cmd/yarn-operator/options/controllers.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package options 18 | 19 | import ( 20 | "sigs.k8s.io/controller-runtime/pkg/manager" 21 | 22 | yarnnoderes "github.com/koordinator-sh/yarn-copilot/pkg/controller/noderesource" 23 | ) 24 | 25 | var controllerAddFuncs = map[string]func(manager.Manager) error{ 26 | yarnnoderes.Name: yarnnoderes.Add, 27 | } 28 | 29 | var controllerAddDefault = []string{ 30 | yarnnoderes.Name, 31 | } 32 | -------------------------------------------------------------------------------- /hack/update-license-header.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright 2022 The Koordinator Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | PROJECT=$(cd $(dirname $0)/..; pwd) 19 | 20 | LICENSEHEADERCHECKER_VERSION=v1.4.0 21 | 22 | GOBIN=${PROJECT}/bin go install github.com/lluissm/license-header-checker/cmd/license-header-checker@${LICENSEHEADERCHECKER_VERSION} 23 | 24 | LICENSEIGNORE=$(cat ${PROJECT}/.licenseignore | tr '\n' ',') 25 | 26 | ${PROJECT}/bin/license-header-checker -r -a -v -i ${LICENSEIGNORE} ${PROJECT}/hack/boilerplate/boilerplate.go.txt . go 27 | -------------------------------------------------------------------------------- /pkg/copilot-agent/utils/utils.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package utils 18 | 19 | func DiffMap(m1, m2 map[string]struct{}) (m1More, m1Less map[string]struct{}) { 20 | m1More = map[string]struct{}{} 21 | m1Less = map[string]struct{}{} 22 | for k1 := range m1 { 23 | if _, ok := m2[k1]; !ok { 24 | m1More[k1] = struct{}{} // key exists only in the first map (m1) 25 | } 26 | } 27 | 28 | for k2 := range m2 { 29 | if _, ok := m1[k2]; !ok { 30 | m1Less[k2] = struct{}{} // key exists only in the second map (m2) 31 | } 32 | } 33 | 34 | return 35 | } 36 | -------------------------------------------------------------------------------- /cmd/yarn-operator/options/scheme.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package options 18 | 19 | import ( 20 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 21 | "k8s.io/apimachinery/pkg/runtime" 22 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 23 | ) 24 | 25 | var Scheme = runtime.NewScheme() 26 | 27 | func init() { 28 | _ = clientgoscheme.AddToScheme(Scheme) 29 | 30 | Scheme.AddUnversionedTypes(metav1.SchemeGroupVersion, &metav1.UpdateOptions{}, &metav1.DeleteOptions{}, &metav1.CreateOptions{}) 31 | // +kubebuilder:scaffold:scheme 32 | } 33 | -------------------------------------------------------------------------------- /hack/mock-gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2022-2023 The Koordinator Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | set -e 19 | 20 | SHELL_FOLDER=$(cd "$(dirname "$0")";pwd) 21 | LICENSE_HEADER_PATH="./hack/boilerplate/boilerplate.go.txt" 22 | 23 | cd $GOPATH/src/github.com/koordinator-sh/yarn-copilot 24 | 25 | # generates gomock files 26 | mockgen -source pkg/yarn/client/factory.go \ 27 | -destination pkg/yarn/client/mockclient/mock_factory.go \ 28 | -copyright_file ${LICENSE_HEADER_PATH} 29 | mockgen -source pkg/yarn/client/client.go \ 30 | -destination pkg/yarn/client/mockclient/mock_client.go \ 31 | -copyright_file ${LICENSE_HEADER_PATH} 32 | -------------------------------------------------------------------------------- /config/manager/role.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | creationTimestamp: null 6 | name: koord-yarn-operator 7 | rules: 8 | - apiGroups: 9 | - "" 10 | resources: 11 | - configmaps 12 | verbs: 13 | - get 14 | - list 15 | - watch 16 | - create 17 | - update 18 | - apiGroups: 19 | - "" 20 | resources: 21 | - events 22 | verbs: 23 | - get 24 | - list 25 | - watch 26 | - create 27 | - update 28 | - apiGroups: 29 | - "" 30 | resources: 31 | - nodes 32 | verbs: 33 | - get 34 | - list 35 | - watch 36 | - apiGroups: 37 | - "" 38 | resources: 39 | - pods 40 | verbs: 41 | - get 42 | - list 43 | - watch 44 | --- 45 | apiVersion: v1 46 | kind: ServiceAccount 47 | metadata: 48 | namespace: koordinator-system 49 | name: koord-yarn-operator 50 | --- 51 | apiVersion: rbac.authorization.k8s.io/v1 52 | kind: ClusterRoleBinding 53 | metadata: 54 | name: koord-yarn-operator-rolebinding 55 | roleRef: 56 | apiGroup: rbac.authorization.k8s.io 57 | kind: ClusterRole 58 | name: koord-yarn-operator 59 | subjects: 60 | - kind: ServiceAccount 61 | name: koord-yarn-operator 62 | namespace: koordinator-system -------------------------------------------------------------------------------- 
/pkg/yarn/client/examples/rm-get-cluster-nodes-with-ha/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "log" 21 | 22 | "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopyarn" 23 | yarnclient "github.com/koordinator-sh/yarn-copilot/pkg/yarn/client" 24 | ) 25 | 26 | func main() { 27 | // Create yarnClient 28 | yarnClient, _ := yarnclient.DefaultYarnClientFactory.CreateDefaultYarnClient() 29 | 30 | request := &hadoopyarn.GetClusterNodesRequestProto{ 31 | NodeStates: []hadoopyarn.NodeStateProto{}, 32 | } 33 | response, err := yarnClient.GetClusterNodes(request) 34 | 35 | if err != nil { 36 | log.Fatal("GetClusterNode ", err) 37 | } 38 | 39 | log.Printf("GetClusterNode response %v", response) 40 | } 41 | -------------------------------------------------------------------------------- /embargo-policy.md: -------------------------------------------------------------------------------- 1 | # Embargo Policy 2 | 3 | This policy forbids members of this project's security contacts and any others 4 | defined below from sharing information outside of the security contacts and this 5 | listing without need-to-know and advance notice. 
6 | 7 | The information members and others receive from the list defined below must: 8 | 9 | * not be made public, 10 | * not be shared, 11 | * not be hinted at 12 | * must be kept confidential and close held 13 | 14 | Except with the list's explicit approval. This holds true until the public 15 | disclosure date/time that was agreed upon by the list. 16 | 17 | If information is inadvertently shared beyond what is allowed by this policy, 18 | you are REQUIRED to inform the security contacts kubernetes-security@service.aliyun.com of exactly what 19 | information leaked and to whom. A retrospective will take place after the leak 20 | so we can assess how to not make this mistake in the future. 21 | 22 | Violation of this policy will result in the immediate removal and subsequent 23 | replacement of you from this list or the Security Contacts. 24 | 25 | ## Disclosure Timeline 26 | 27 | This project sustains a **disclosure timeline** to ensure we provide a 28 | quality, tested release. On some occasions, we may need to extend this timeline 29 | due to complexity of the problem, lack of expertise available, or other reasons. 30 | Submitters will be notified if an extension occurs. 31 | -------------------------------------------------------------------------------- /pkg/yarn/client/ha_service_client.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package client 18 | 19 | import ( 20 | "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopcommon" 21 | yarnservice "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/service" 22 | ) 23 | 24 | type YarnHAClient struct { 25 | client yarnservice.HAServiceProtocolService 26 | } 27 | 28 | func CreateYarnHAClient(rmAddress string) (*YarnHAClient, error) { 29 | c, err := yarnservice.DialHAServiceProtocolService(rmAddress) 30 | return &YarnHAClient{client: c}, err 31 | } 32 | 33 | func (c *YarnHAClient) GetServiceStatus(request *hadoopcommon.GetServiceStatusRequestProto) (*hadoopcommon.GetServiceStatusResponseProto, error) { 34 | response := &hadoopcommon.GetServiceStatusResponseProto{} 35 | err := c.client.GetServiceStatus(request, response) 36 | if err != nil { 37 | return nil, err 38 | } 39 | return response, nil 40 | } 41 | -------------------------------------------------------------------------------- /pkg/yarn/client/application_client.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package client 18 | 19 | import ( 20 | "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopyarn" 21 | yarnservice "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/service" 22 | yarnconf "github.com/koordinator-sh/yarn-copilot/pkg/yarn/config" 23 | ) 24 | 25 | type YarnApplicationClient struct { 26 | client yarnservice.ApplicationClientProtocolService 27 | } 28 | 29 | func CreateYarnApplicationClient(conf yarnconf.YarnConfiguration, rmAddress *string) (*YarnApplicationClient, error) { 30 | c, err := yarnservice.DialApplicationClientProtocolService(conf, rmAddress) 31 | return &YarnApplicationClient{client: c}, err 32 | } 33 | 34 | func (c *YarnApplicationClient) GetClusterNode(request *hadoopyarn.GetClusterNodesRequestProto) (*hadoopyarn.GetClusterNodesResponseProto, error) { 35 | response := &hadoopyarn.GetClusterNodesResponseProto{} 36 | err := c.client.GetClusterNodes(request, response) 37 | if err != nil { 38 | return response, err 39 | } 40 | return response, nil 41 | } 42 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 90 3 | 4 | # Number of days of inactivity before a stale issue is closed 5 | daysUntilClose: 30 6 | 7 | # Issues with these labels will never be considered stale 8 | exemptLabels: 9 | - pinned 10 | - security 11 | 12 | # Label to use when marking an issue as stale 13 | staleLabel: lifecycle/stale 14 | 15 | # Comment to post when marking an issue as stale. Set to `false` to disable 16 | markComment: | 17 | This issue has been automatically marked as stale because it has not had recent activity. 
18 | This bot triages issues and PRs according to the following rules: 19 | - After 90d of inactivity, `lifecycle/stale` is applied 20 | - After 30d of inactivity since `lifecycle/stale` was applied, the issue is closed 21 | You can: 22 | - Mark this issue or PR as fresh with `/remove-lifecycle stale` 23 | - Close this issue or PR with `/close` 24 | Thank you for your contributions. 25 | 26 | # Comment to post when closing a stale issue. Set to `false` to disable 27 | closeComment: | 28 | This issue has been automatically closed because it has not had recent activity. 29 | This bot triages issues and PRs according to the following rules: 30 | - After 90d of inactivity, `lifecycle/stale` is applied 31 | - After 30d of inactivity since `lifecycle/stale` was applied, the issue is closed 32 | You can: 33 | - Reopen this PR with `/reopen` 34 | Thank you for your contributions. 35 | 36 | # Set to true to ignore issues in a project (defaults to false) 37 | exemptProjects: true 38 | 39 | # Set to true to ignore issues in a milestone (defaults to false) 40 | exemptMilestones: true 41 | -------------------------------------------------------------------------------- /pkg/yarn/client/rm_admin_client.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package client 18 | 19 | import ( 20 | yarnserver "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopyarn/server" 21 | yarnservice "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/service" 22 | yarnconf "github.com/koordinator-sh/yarn-copilot/pkg/yarn/config" 23 | ) 24 | 25 | type YarnAdminClient struct { 26 | client yarnservice.ResourceManagerAdministrationProtocolService 27 | } 28 | 29 | func CreateYarnAdminClient(conf yarnconf.YarnConfiguration, rmAddress *string) (*YarnAdminClient, error) { 30 | c, err := yarnservice.DialResourceManagerAdministrationProtocolService(conf, rmAddress) 31 | return &YarnAdminClient{client: c}, err 32 | } 33 | 34 | func (c *YarnAdminClient) UpdateNodeResource(request *yarnserver.UpdateNodeResourceRequestProto) (*yarnserver.UpdateNodeResourceResponseProto, error) { 35 | response := &yarnserver.UpdateNodeResourceResponseProto{} 36 | err := c.client.UpdateNodeResource(request, response) 37 | if err != nil { 38 | return nil, err 39 | } 40 | return response, nil 41 | } 42 | -------------------------------------------------------------------------------- /pkg/copilot-agent/nm/nm_pod_discover.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package nm 18 | 19 | import ( 20 | "fmt" 21 | 22 | statesinformer "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer/impl" 23 | ) 24 | 25 | const ( 26 | ComponentLabelKey = "app.kubernetes.io/component" 27 | NodeManagerComponentLabelName = "node-manager" 28 | ) 29 | 30 | type NMPodWatcher struct { 31 | kubeletstub statesinformer.KubeletStub 32 | } 33 | 34 | func NewNMPodWater(kubeletstub statesinformer.KubeletStub) *NMPodWatcher { 35 | return &NMPodWatcher{kubeletstub: kubeletstub} 36 | } 37 | 38 | func (n *NMPodWatcher) GetNMPodEndpoint() (string, bool, error) { 39 | pods, err := n.kubeletstub.GetAllPods() 40 | if err != nil { 41 | return "", false, err 42 | } 43 | for _, pod := range pods.Items { 44 | if pod.Labels[ComponentLabelKey] != NodeManagerComponentLabelName { 45 | continue 46 | } 47 | if pod.Spec.HostNetwork { 48 | return "localhost:8042", true, nil 49 | } 50 | return fmt.Sprintf("%s:8042", pod.Status.PodIP), true, nil 51 | } 52 | return "", false, nil 53 | } 54 | -------------------------------------------------------------------------------- /cmd/yarn-copilot-agent/options/options.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
// Default values applied by NewConfiguration.
const (
	DefaultServerEndpoint          = "/var/run/yarn-copilot/yarn-copilot.sock"
	DefaultYarnContainerCgroupPath = "kubepods/besteffort/hadoop-yarn"
	DefaultSyncCgroupPeriod        = 10 * time.Second
	DefaultNodeManagerEndpoint     = "localhost:8042"
	DefaultCgroupRootDir           = "/sys/fs/cgroup/"
)

// Configuration holds the settings of the yarn-copilot agent.
type Configuration struct {
	ServerEndpoint          string
	YarnContainerCgroupPath string
	SyncMemoryCgroup        bool
	SyncCgroupPeriod        time.Duration
	NodeMangerEndpoint      string
	CgroupRootDir           string
}

// NewConfiguration returns a freshly allocated Configuration whose fields are
// set to the Default* constants; SyncMemoryCgroup starts out false.
func NewConfiguration() *Configuration {
	cfg := new(Configuration)
	cfg.ServerEndpoint = DefaultServerEndpoint
	cfg.YarnContainerCgroupPath = DefaultYarnContainerCgroupPath
	cfg.SyncMemoryCgroup = false
	cfg.SyncCgroupPeriod = DefaultSyncCgroupPeriod
	cfg.NodeMangerEndpoint = DefaultNodeManagerEndpoint
	cfg.CgroupRootDir = DefaultCgroupRootDir
	return cfg
}
HADOOP_HDFS_HOME=${HADOOP_HOME} \ 24 | HADOOP_MAPRED_HOME=${HADOOP_HOME} \ 25 | HADOOP_YARN_HOME=${HADOOP_HOME} \ 26 | HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop \ 27 | PATH=${PATH}:${HADOOP_HOME}/bin:${SPARK_HOME}/bin 28 | 29 | COPY --from=BUILDER /opt/hadoop-${HADOOP_VERSION} ${HADOOP_HOME} 30 | COPY --from=BUILDER /opt/spark-${SPARK_VERSION}-bin-hadoop3 ${SPARK_HOME} 31 | 32 | RUN apt-get update && apt-get install -y apt-transport-https 33 | RUN curl https://mirrors.aliyun.com/kubernetes/apt/doc/apt-key.gpg | apt-key add - 34 | RUN echo 'deb https://mirrors.aliyun.com/kubernetes/apt/ kubernetes-xenial main' >> /etc/apt/sources.list.d/kubernetes.list 35 | #RUN cat </etc/apt/sources.list.d/kubernetes.list \ 36 | # deb https://mirrors.aliyun.com/kubernetes/apt/ kubernetes-xenial main \ 37 | # EOF 38 | 39 | RUN apt-get update 40 | RUN apt-get install -y kubectl dnsutils 41 | 42 | WORKDIR $HADOOP_HOME -------------------------------------------------------------------------------- /config/manager/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: yarn-config 5 | namespace: koordinator-system 6 | data: 7 | yarn-site.xml: | 8 | 9 | 10 | yarn.resourcemanager.admin.address 11 | 0.0.0.0:8033 12 | 13 | 14 | yarn.resourcemanager.address 15 | 0.0.0.0:8032 16 | 17 | 18 | yarn.resourcemanager.ha.enabled 19 | true 20 | 21 | 22 | yarn.resourcemanager.ha.rm-ids 23 | rm1,rm2,rm3 24 | 25 | 26 | yarn.resourcemanager.admin.address.rm1 27 | 0.0.0.0:8033 28 | 29 | 30 | yarn.resourcemanager.admin.address.rm2 31 | 0.0.0.0:8033 32 | 33 | 34 | yarn.resourcemanager.admin.address.rm3 35 | 0.0.0.0:8033 36 | 37 | 38 | yarn.resourcemanager.address.rm1 39 | 0.0.0.0:8032 40 | 41 | 42 | yarn.resourcemanager.address.rm2 43 | 0.0.0.0:8032 44 | 45 | 46 | yarn.resourcemanager.address.rm3 47 | 0.0.0.0:8032 48 | 49 | core-site.xml: | 50 | 51 | 
-------------------------------------------------------------------------------- /pkg/yarn/client/examples/ha-get-service-status/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "log" 21 | "os" 22 | 23 | "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopcommon" 24 | yarnclient "github.com/koordinator-sh/yarn-copilot/pkg/yarn/client" 25 | yarnconf "github.com/koordinator-sh/yarn-copilot/pkg/yarn/config" 26 | ) 27 | 28 | func main() { 29 | // Create YarnConfiguration 30 | conf, err := yarnconf.NewYarnConfiguration(os.Getenv("HADOOP_CONF_DIR"), "") 31 | if err != nil { 32 | log.Fatal("new yarn conf", err) 33 | } 34 | 35 | rmIDs, err := conf.GetRMs() 36 | if err != nil { 37 | log.Fatal("get rms from conf", err) 38 | } 39 | 40 | for _, rmID := range rmIDs { 41 | rmAddr, err := conf.GetRMAdminAddressByID(rmID) 42 | if err != nil { 43 | log.Fatal("GetRMAdminAddressByID", err) 44 | } 45 | 46 | // Create YarnAdminClient 47 | yarnHAClient, _ := yarnclient.CreateYarnHAClient(rmAddr) 48 | 49 | request := &hadoopcommon.GetServiceStatusRequestProto{} 50 | response, err := yarnHAClient.GetServiceStatus(request) 51 | 52 | if err != nil { 53 | log.Fatal("yarnHAClient.GetServiceStatus ", err) 54 | } 55 | 56 | log.Printf("GetServiceStatus response %v", response) 
57 | } 58 | } 59 | -------------------------------------------------------------------------------- /pkg/yarn/apis/proto/hadoopcommon/RefreshCallQueueProtocol.proto: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * These .proto interfaces are private and stable. 21 | * Please see http://wiki.apache.org/hadoop/Compatibility 22 | * for what changes are allowed for a *stable* .proto interface. 23 | */ 24 | 25 | syntax = "proto2"; 26 | option java_package = "org.apache.hadoop.ipc.proto"; 27 | option java_outer_classname = "RefreshCallQueueProtocolProtos"; 28 | option java_generic_services = true; 29 | option java_generate_equals_and_hash = true; 30 | package hadoop.common; 31 | 32 | /** 33 | * Refresh callqueue request. 34 | */ 35 | message RefreshCallQueueRequestProto { 36 | } 37 | 38 | /** 39 | * void response. 40 | */ 41 | message RefreshCallQueueResponseProto { 42 | } 43 | 44 | /** 45 | * Protocol which is used to refresh the callqueue. 46 | */ 47 | service RefreshCallQueueProtocolService { 48 | /** 49 | * Refresh the callqueue. 
50 | */ 51 | rpc refreshCallQueue(RefreshCallQueueRequestProto) 52 | returns(RefreshCallQueueResponseProto); 53 | } 54 | -------------------------------------------------------------------------------- /pkg/yarn/client/examples/rm-update-node-resource-with-ha/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "log" 21 | 22 | "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopyarn" 23 | yarnserver "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopyarn/server" 24 | yarnclient "github.com/koordinator-sh/yarn-copilot/pkg/yarn/client" 25 | ) 26 | 27 | func main() { 28 | // Create yarnClient 29 | yarnClient, _ := yarnclient.DefaultYarnClientFactory.CreateDefaultYarnClient() 30 | 31 | host := "0.0.0.0" 32 | port := int32(8041) 33 | vCores := int32(101) 34 | memoryMB := int64(10240) 35 | request := &yarnserver.UpdateNodeResourceRequestProto{ 36 | NodeResourceMap: []*hadoopyarn.NodeResourceMapProto{ 37 | { 38 | NodeId: &hadoopyarn.NodeIdProto{ 39 | Host: &host, 40 | Port: &port, 41 | }, 42 | ResourceOption: &hadoopyarn.ResourceOptionProto{ 43 | Resource: &hadoopyarn.ResourceProto{ 44 | Memory: &memoryMB, 45 | VirtualCores: &vCores, 46 | }, 47 | }, 48 | }, 49 | }, 50 | } 51 | response, err := yarnClient.UpdateNodeResource(request) 52 | 53 | 
if err != nil { 54 | log.Fatal("yarnClient.UpdateNodeResource ", err) 55 | } 56 | 57 | log.Printf("UpdateNodeResource response %v", response) 58 | } 59 | -------------------------------------------------------------------------------- /pkg/copilot-agent/server/helper.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package server 18 | 19 | import ( 20 | corev1 "k8s.io/api/core/v1" 21 | "k8s.io/apimachinery/pkg/api/resource" 22 | 23 | "github.com/koordinator-sh/yarn-copilot/pkg/copilot-agent/nm" 24 | ) 25 | 26 | func ParseContainerInfo(yarnContainer *nm.YarnContainer, op *nm.NodeMangerOperator) *ContainerInfo { 27 | return &ContainerInfo{ 28 | Name: yarnContainer.Id, 29 | Namespace: "yarn", 30 | UID: yarnContainer.Id, 31 | CgroupDir: op.GenerateCgroupPath(yarnContainer.Id), 32 | HostNetwork: true, 33 | Priority: 1, 34 | Resources: corev1.ResourceRequirements{ 35 | Limits: map[corev1.ResourceName]resource.Quantity{ 36 | corev1.ResourceCPU: *resource.NewMilliQuantity(int64(yarnContainer.TotalVCoresNeeded*1000), resource.DecimalSI), 37 | corev1.ResourceMemory: *resource.NewMilliQuantity(int64(yarnContainer.TotalMemoryNeededMB*1024*1024*1000), resource.DecimalSI), 38 | }, 39 | Requests: map[corev1.ResourceName]resource.Quantity{ 40 | corev1.ResourceCPU: 
*resource.NewMilliQuantity(int64(yarnContainer.TotalVCoresNeeded*1000), resource.DecimalSI), 41 | corev1.ResourceMemory: *resource.NewMilliQuantity(int64(yarnContainer.TotalMemoryNeededMB*1024*1024*1000), resource.DecimalSI), 42 | }, 43 | }, 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /pkg/copilot-agent/nm/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
// FinalContainerStates lists the container states that IsFinalState treats as
// final.
var FinalContainerStates = []string{
	"KILLING",
	"DONE",
	"LOCALIZATION_FAILED",
	"CONTAINER_RESOURCES_CLEANINGUP",
	"CONTAINER_CLEANEDUP_AFTER_KILL",
	"EXITED_WITH_FAILURE",
	"EXITED_WITH_SUCCESS",
}

// YarnContainer is the JSON representation of a single YARN container; the
// struct tags give the JSON field names.
type YarnContainer struct {
	Id                  string   `json:"id"`
	Appid               string   `json:"appid"`
	State               string   `json:"state"`
	ExitCode            int      `json:"exitCode"`
	Diagnostics         string   `json:"diagnostics"`
	User                string   `json:"user"`
	TotalMemoryNeededMB int      `json:"totalMemoryNeededMB"`
	TotalVCoresNeeded   int      `json:"totalVCoresNeeded"`
	ContainerLogsLink   string   `json:"containerLogsLink"`
	NodeId              string   `json:"nodeId"`
	MemUsed             float64  `json:"memUsed"`
	MemMaxed            float64  `json:"memMaxed"`
	CpuUsed             float64  `json:"cpuUsed"`
	CpuMaxed            float64  `json:"cpuMaxed"`
	ContainerLogFiles   []string `json:"containerLogFiles"`
}

// IsFinalState reports whether the container's State exactly matches one of
// the entries in FinalContainerStates.
func (c *YarnContainer) IsFinalState() bool {
	current := c.State
	for i := 0; i < len(FinalContainerStates); i++ {
		if FinalContainerStates[i] == current {
			return true
		}
	}
	return false
}
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * These .proto interfaces are private and stable. 21 | * Please see http://wiki.apache.org/hadoop/Compatibility 22 | * for what changes are allowed for a *stable* .proto interface. 23 | */ 24 | 25 | syntax = "proto2"; 26 | option java_package = "org.apache.hadoop.tools.proto"; 27 | option java_outer_classname = "GetUserMappingsProtocolProtos"; 28 | option java_generic_services = true; 29 | option java_generate_equals_and_hash = true; 30 | package hadoop.common; 31 | 32 | /** 33 | * Get groups for user request. 34 | */ 35 | message GetGroupsForUserRequestProto { 36 | required string user = 1; 37 | } 38 | 39 | /** 40 | * Response for get groups. 41 | */ 42 | message GetGroupsForUserResponseProto { 43 | repeated string groups = 1; 44 | } 45 | 46 | 47 | /** 48 | * Protocol which maps users to groups. 49 | */ 50 | service GetUserMappingsProtocolService { 51 | /** 52 | * Get the groups which are mapped to the given user. 53 | */ 54 | rpc getGroupsForUser(GetGroupsForUserRequestProto) 55 | returns(GetGroupsForUserResponseProto); 56 | } 57 | -------------------------------------------------------------------------------- /pkg/yarn/apis/proto/hadoopcommon/RefreshAuthorizationPolicyProtocol.proto: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * These .proto interfaces are private and stable. 21 | * Please see http://wiki.apache.org/hadoop/Compatibility 22 | * for what changes are allowed for a *stable* .proto interface. 23 | */ 24 | 25 | syntax = "proto2"; 26 | option java_package = "org.apache.hadoop.security.proto"; 27 | option java_outer_classname = "RefreshAuthorizationPolicyProtocolProtos"; 28 | option java_generic_services = true; 29 | option java_generate_equals_and_hash = true; 30 | package hadoop.common; 31 | 32 | /** 33 | * Refresh service acl request. 34 | */ 35 | message RefreshServiceAclRequestProto { 36 | } 37 | 38 | /** 39 | * void response 40 | */ 41 | message RefreshServiceAclResponseProto { 42 | } 43 | 44 | /** 45 | * Protocol which is used to refresh the authorization policy in use currently. 46 | */ 47 | service RefreshAuthorizationPolicyProtocolService { 48 | /** 49 | * Refresh the service-level authorization policy in-effect. 
50 | */ 51 | rpc refreshServiceAcl(RefreshServiceAclRequestProto) 52 | returns(RefreshServiceAclResponseProto); 53 | } 54 | -------------------------------------------------------------------------------- /pkg/yarn/apis/proto/hadoopcommon/ZKFCProtocol.proto: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * These .proto interfaces are private and stable. 21 | * Please see http://wiki.apache.org/hadoop/Compatibility 22 | * for what changes are allowed for a *stable* .proto interface. 
23 | */ 24 | 25 | syntax = "proto2"; 26 | option java_package = "org.apache.hadoop.ha.proto"; 27 | option java_outer_classname = "ZKFCProtocolProtos"; 28 | option java_generic_services = true; 29 | option java_generate_equals_and_hash = true; 30 | package hadoop.common; 31 | 32 | message CedeActiveRequestProto { 33 | required uint32 millisToCede = 1; 34 | } 35 | 36 | message CedeActiveResponseProto { 37 | } 38 | 39 | message GracefulFailoverRequestProto { 40 | } 41 | 42 | message GracefulFailoverResponseProto { 43 | } 44 | 45 | 46 | /** 47 | * Protocol provides manual control of the ZK Failover Controllers 48 | */ 49 | service ZKFCProtocolService { 50 | /** 51 | * Request that the service cede its active state, and quit the election 52 | * for some amount of time 53 | */ 54 | rpc cedeActive(CedeActiveRequestProto) 55 | returns(CedeActiveResponseProto); 56 | 57 | 58 | rpc gracefulFailover(GracefulFailoverRequestProto) 59 | returns(GracefulFailoverResponseProto); 60 | } 61 | -------------------------------------------------------------------------------- /pkg/yarn/apis/proto/hadoopcommon/IpcConnectionContext.proto: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * These .proto interfaces are private and stable. 21 | * Please see http://wiki.apache.org/hadoop/Compatibility 22 | * for what changes are allowed for a *stable* .proto interface. 23 | */ 24 | 25 | syntax = "proto2"; 26 | option java_package = "org.apache.hadoop.ipc.protobuf"; 27 | option java_outer_classname = "IpcConnectionContextProtos"; 28 | option java_generate_equals_and_hash = true; 29 | package hadoop.common; 30 | 31 | /** 32 | * Spec for UserInformationProto is specified in ProtoUtil#makeIpcConnectionContext 33 | */ 34 | message UserInformationProto { 35 | optional string effectiveUser = 1; 36 | optional string realUser = 2; 37 | } 38 | 39 | /** 40 | * The connection context is sent as part of the connection establishment. 41 | * It establishes the context for ALL Rpc calls within the connection. 42 | */ 43 | message IpcConnectionContextProto { 44 | // UserInfo beyond what is determined as part of security handshake 45 | // at connection time (kerberos, tokens etc). 46 | optional UserInformationProto userInfo = 2; 47 | 48 | // Protocol name for next rpc layer. 49 | // The client created a proxy with this protocol name 50 | optional string protocol = 3; 51 | } 52 | -------------------------------------------------------------------------------- /pkg/yarn/client/examples/rm-update-node-resource/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "log" 21 | "os" 22 | 23 | "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopyarn" 24 | yarnserver "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopyarn/server" 25 | yarnclient "github.com/koordinator-sh/yarn-copilot/pkg/yarn/client" 26 | yarnconf "github.com/koordinator-sh/yarn-copilot/pkg/yarn/config" 27 | ) 28 | 29 | func main() { 30 | // Create YarnConfiguration 31 | conf, _ := yarnconf.NewYarnConfiguration(os.Getenv("HADOOP_CONF_DIR"), "") 32 | 33 | // Create YarnAdminClient 34 | yarnAdminClient, _ := yarnclient.CreateYarnAdminClient(conf, nil) 35 | 36 | host := "core-1-3.c-f55b4f620febfd69.cn-zhangjiakou.emr.aliyuncs.com" 37 | port := int32(8041) 38 | vCores := int32(100) 39 | memoryMB := int64(10240) 40 | request := &yarnserver.UpdateNodeResourceRequestProto{ 41 | NodeResourceMap: []*hadoopyarn.NodeResourceMapProto{ 42 | { 43 | NodeId: &hadoopyarn.NodeIdProto{ 44 | Host: &host, 45 | Port: &port, 46 | }, 47 | ResourceOption: &hadoopyarn.ResourceOptionProto{ 48 | Resource: &hadoopyarn.ResourceProto{ 49 | Memory: &memoryMB, 50 | VirtualCores: &vCores, 51 | }, 52 | }, 53 | }, 54 | }, 55 | } 56 | response, err := yarnAdminClient.UpdateNodeResource(request) 57 | 58 | if err != nil { 59 | log.Fatal("yarnAdminClient.UpdateNodeResource ", err) 60 | } 61 | 62 | log.Printf("UpdateNodeResource response %v", response) 63 | } 64 | -------------------------------------------------------------------------------- /pkg/yarn/apis/service/ha_service.go: 
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package service 18 | 19 | import ( 20 | "encoding/json" 21 | "math" 22 | 23 | uuid "github.com/nu7hatch/gouuid" 24 | "google.golang.org/protobuf/proto" 25 | 26 | gohadoop "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/auth" 27 | "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopcommon" 28 | hadoop_ipc_client "github.com/koordinator-sh/yarn-copilot/pkg/yarn/client/ipc" 29 | ) 30 | 31 | // Reference proto, json, and math imports to suppress error if they are not otherwise used. 
32 | var _ = proto.Marshal 33 | var _ = &json.SyntaxError{} 34 | var _ = math.Inf 35 | 36 | type HAServiceProtocolService interface { 37 | GetServiceStatus(in *hadoopcommon.GetServiceStatusRequestProto, out *hadoopcommon.GetServiceStatusResponseProto) error 38 | } 39 | 40 | var HA_SERVICE_PROTOCOL = "org.apache.hadoop.ha.HAServiceProtocol" 41 | 42 | type HAServiceProtocolServiceClient struct { 43 | *hadoop_ipc_client.Client 44 | } 45 | 46 | func (c *HAServiceProtocolServiceClient) GetServiceStatus(in *hadoopcommon.GetServiceStatusRequestProto, out *hadoopcommon.GetServiceStatusResponseProto) error { 47 | return c.Call(gohadoop.GetCalleeRPCRequestHeaderProto(&HA_SERVICE_PROTOCOL), in, out) 48 | } 49 | 50 | func DialHAServiceProtocolService(serverAddress string) (HAServiceProtocolService, error) { 51 | clientId, _ := uuid.NewV4() 52 | ugi, _ := gohadoop.CreateSimpleUGIProto() 53 | c := &hadoop_ipc_client.Client{ClientId: clientId, Ugi: ugi, ServerAddress: serverAddress} 54 | return &HAServiceProtocolServiceClient{c}, nil 55 | } 56 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | permissions: 9 | contents: write 10 | packages: write 11 | 12 | jobs: 13 | build-and-push: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | target: [ yarn-operator ] 19 | env: 20 | ALIYUN_BJ_REG: registry.cn-beijing.aliyuncs.com 21 | ALIYUN_HZ_REG: registry.cn-hangzhou.aliyuncs.com 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v4 25 | - name: Set up QEMU 26 | uses: docker/setup-qemu-action@v3 27 | - name: Set up Docker Buildx 28 | uses: docker/setup-buildx-action@v3 29 | - name: Login to AliyunCS_BJ 30 | uses: docker/login-action@v3 31 | with: 32 | registry: ${{ env.ALIYUN_BJ_REG }} 33 | username: ${{ secrets.ALIYUN_USERNAME }} 34 
| password: ${{ secrets.ALIYUN_PWD }} 35 | - name: Login to AliyunCS_HZ 36 | uses: docker/login-action@v3 37 | with: 38 | registry: ${{ env.ALIYUN_HZ_REG }} 39 | username: ${{ secrets.ALIYUN_USERNAME }} 40 | password: ${{ secrets.ALIYUN_PWD }} 41 | - name: Build and push 42 | uses: docker/build-push-action@v5 43 | with: 44 | platforms: linux/amd64,linux/arm64 45 | push: true 46 | pull: true 47 | file: docker/${{ matrix.target }}.dockerfile 48 | labels: | 49 | org.opencontainers.image.title=${{ matrix.target }} 50 | org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} 51 | org.opencontainers.image.revision=${{ github.sha }} 52 | org.opencontainers.image.created=${{ github.event.repository.updated_at}} 53 | org.opencontainers.image.licenses=Apache-2.0 54 | tags: | 55 | ${{ env.ALIYUN_BJ_REG }}/${{ github.repository_owner }}/${{ matrix.target }}:${{ github.ref_name }} 56 | ${{ env.ALIYUN_HZ_REG }}/${{ github.repository_owner }}/${{ matrix.target }}:${{ github.ref_name }} 57 | cache-from: type=gha,scope=build-${{ matrix.target }} 58 | cache-to: type=gha,mode=max,scope=build-${{ matrix.target }} -------------------------------------------------------------------------------- /pkg/yarn/apis/proto/hadoopcommon/GenericRefreshProtocol.proto: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * These .proto interfaces are private and stable. 21 | * Please see http://wiki.apache.org/hadoop/Compatibility 22 | * for what changes are allowed for a *stable* .proto interface. 23 | */ 24 | 25 | syntax = "proto2"; 26 | option java_package = "org.apache.hadoop.ipc.proto"; 27 | option java_outer_classname = "GenericRefreshProtocolProtos"; 28 | option java_generic_services = true; 29 | option java_generate_equals_and_hash = true; 30 | package hadoop.common; 31 | 32 | /** 33 | * Refresh request. 34 | */ 35 | message GenericRefreshRequestProto { 36 | optional string identifier = 1; 37 | repeated string args = 2; 38 | } 39 | 40 | /** 41 | * A single response from a refresh handler. 42 | */ 43 | message GenericRefreshResponseProto { 44 | optional int32 exitStatus = 1; // unix exit status to return 45 | optional string userMessage = 2; // to be displayed to the user 46 | optional string senderName = 3; // which handler sent this message 47 | } 48 | 49 | /** 50 | * Collection of responses from zero or more handlers. 51 | */ 52 | message GenericRefreshResponseCollectionProto { 53 | repeated GenericRefreshResponseProto responses = 1; 54 | } 55 | 56 | /** 57 | * Protocol which is used to refresh a user-specified feature. 
58 | */ 59 | service GenericRefreshProtocolService { 60 | rpc refresh(GenericRefreshRequestProto) 61 | returns(GenericRefreshResponseCollectionProto); 62 | } 63 | -------------------------------------------------------------------------------- /pkg/yarn/apis/proto/hadoopcommon/RefreshUserMappingsProtocol.proto: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * These .proto interfaces are private and stable. 21 | * Please see http://wiki.apache.org/hadoop/Compatibility 22 | * for what changes are allowed for a *stable* .proto interface. 23 | */ 24 | 25 | syntax = "proto2"; 26 | option java_package = "org.apache.hadoop.security.proto"; 27 | option java_outer_classname = "RefreshUserMappingsProtocolProtos"; 28 | option java_generic_services = true; 29 | option java_generate_equals_and_hash = true; 30 | package hadoop.common; 31 | 32 | /** 33 | * Refresh user to group mappings request. 
34 | */ 35 | message RefreshUserToGroupsMappingsRequestProto { 36 | } 37 | 38 | /** 39 | * void response 40 | */ 41 | message RefreshUserToGroupsMappingsResponseProto { 42 | } 43 | 44 | /** 45 | * Refresh superuser configuration request. 46 | */ 47 | message RefreshSuperUserGroupsConfigurationRequestProto { 48 | } 49 | 50 | /** 51 | * void response 52 | */ 53 | message RefreshSuperUserGroupsConfigurationResponseProto { 54 | } 55 | 56 | /** 57 | * Protocol to refresh the user mappings. 58 | */ 59 | service RefreshUserMappingsProtocolService { 60 | /** 61 | * Refresh user to group mappings. 62 | */ 63 | rpc refreshUserToGroupsMappings(RefreshUserToGroupsMappingsRequestProto) 64 | returns(RefreshUserToGroupsMappingsResponseProto); 65 | 66 | /** 67 | * Refresh superuser proxy group list. 68 | */ 69 | rpc refreshSuperUserGroupsConfiguration(RefreshSuperUserGroupsConfigurationRequestProto) 70 | returns(RefreshSuperUserGroupsConfigurationResponseProto); 71 | } 72 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/membership.yml: -------------------------------------------------------------------------------- 1 | name: Organization Membership Request 2 | description: Request membership in a Koordinator Org 3 | labels: ["kind/github-membership"] 4 | title: "REQUEST: New membership for " 5 | body: 6 | - id: github 7 | type: input 8 | attributes: 9 | label: GitHub Username 10 | placeholder: e.g. 
@example_user 11 | validations: 12 | required: true 13 | - id: requirements 14 | type: checkboxes 15 | attributes: 16 | label: Requirements 17 | options: 18 | - label: I have reviewed the [community membership guidelines](https://github.com/koordinator-sh/koordinator/blob/main/docs/community/community-membership.md) 19 | required: true 20 | - label: I have [enabled 2FA on my GitHub account](https://github.com/settings/security) 21 | required: true 22 | - label: I am actively contributing to 1 or more Koordinator subprojects 23 | required: true 24 | - label: I have two sponsors that meet the sponsor requirements listed in the community membership guidelines 25 | required: true 26 | - label: I have spoken to my sponsors ahead of this application, and they have agreed to sponsor my application 27 | required: true 28 | - label: I have verified that my sponsors are a reviewer or an approver in at least one OWNERS file within one of the Koordinator GitHub organizations (excluding the contributor-playground) 29 | required: true 30 | - label: "**OPTIONAL:** I have taken the [Inclusive Open Source Community Orientation course](https://training.linuxfoundation.org/training/inclusive-open-source-community-orientation-lfc102/)" 31 | - id: sponsor_1 32 | type: input 33 | attributes: 34 | label: "Sponsor 1" 35 | description: GitHub handle of your sponsor 36 | placeholder: e.g. @sponsor-1 37 | validations: 38 | required: true 39 | - id: sponsor_2 40 | type: input 41 | attributes: 42 | label: "Sponsor 2" 43 | description: GitHub handle of your sponsor 44 | placeholder: e.g. 
@sponsor-2 45 | validations: 46 | required: true 47 | - id: contributions 48 | type: textarea 49 | attributes: 50 | label: List of contributions to the Koordinator project 51 | placeholder: | 52 | - PRs reviewed / authored 53 | - Issues responded to 54 | - Subprojects I am involved with 55 | validations: 56 | required: true 57 | -------------------------------------------------------------------------------- /pkg/yarn/apis/proto/hadoopcommon/TraceAdmin.proto: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * These .proto interfaces are private and stable. 21 | * Please see http://wiki.apache.org/hadoop/Compatibility 22 | * for what changes are allowed for a *stable* .proto interface. 
23 | */ 24 | 25 | syntax = "proto2"; 26 | option java_package = "org.apache.hadoop.tracing"; 27 | option java_outer_classname = "TraceAdminPB"; 28 | option java_generic_services = true; 29 | option java_generate_equals_and_hash = true; 30 | package hadoop.common; 31 | 32 | message ListSpanReceiversRequestProto { 33 | } 34 | 35 | message SpanReceiverListInfo { 36 | required int64 id = 1; 37 | required string className = 2; 38 | } 39 | 40 | message ListSpanReceiversResponseProto { 41 | repeated SpanReceiverListInfo descriptions = 1; 42 | } 43 | 44 | message ConfigPair { 45 | required string key = 1; 46 | required string value = 2; 47 | } 48 | 49 | message AddSpanReceiverRequestProto { 50 | required string className = 1; 51 | repeated ConfigPair config = 2; 52 | } 53 | 54 | message AddSpanReceiverResponseProto { 55 | required int64 id = 1; 56 | } 57 | 58 | message RemoveSpanReceiverRequestProto { 59 | required int64 id = 1; 60 | } 61 | 62 | message RemoveSpanReceiverResponseProto { 63 | } 64 | 65 | service TraceAdminService { 66 | rpc listSpanReceivers(ListSpanReceiversRequestProto) 67 | returns(ListSpanReceiversResponseProto); 68 | 69 | rpc addSpanReceiver(AddSpanReceiverRequestProto) 70 | returns(AddSpanReceiverResponseProto); 71 | 72 | rpc removeSpanReceiver(RemoveSpanReceiverRequestProto) 73 | returns(RemoveSpanReceiverResponseProto); 74 | } 75 | -------------------------------------------------------------------------------- /pkg/yarn/apis/proto/hadoopcommon/Security.proto: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * These .proto interfaces are private and stable. 21 | * Please see http://wiki.apache.org/hadoop/Compatibility 22 | * for what changes are allowed for a *stable* .proto interface. 23 | */ 24 | 25 | syntax = "proto2"; 26 | option java_package = "org.apache.hadoop.security.proto"; 27 | option java_outer_classname = "SecurityProtos"; 28 | option java_generic_services = true; 29 | option java_generate_equals_and_hash = true; 30 | package hadoop.common; 31 | 32 | /** 33 | * Security token identifier 34 | */ 35 | message TokenProto { 36 | required bytes identifier = 1; 37 | required bytes password = 2; 38 | required string kind = 3; 39 | required string service = 4; 40 | } 41 | 42 | message CredentialsKVProto { 43 | required string alias = 1; 44 | optional hadoop.common.TokenProto token = 2; 45 | optional bytes secret = 3; 46 | } 47 | 48 | message CredentialsProto { 49 | repeated hadoop.common.CredentialsKVProto tokens = 1; 50 | repeated hadoop.common.CredentialsKVProto secrets = 2; 51 | } 52 | 53 | message GetDelegationTokenRequestProto { 54 | required string renewer = 1; 55 | } 56 | 57 | message GetDelegationTokenResponseProto { 58 | optional hadoop.common.TokenProto token = 1; 59 | } 60 | 61 | message RenewDelegationTokenRequestProto { 62 | required hadoop.common.TokenProto token = 1; 63 | } 64 | 65 | message RenewDelegationTokenResponseProto { 66 | required uint64 newExpiryTime = 1; 67 | } 68 | 69 | message CancelDelegationTokenRequestProto { 70 | required hadoop.common.TokenProto token = 1; 71 | } 72 | 
73 | message CancelDelegationTokenResponseProto { // void response 74 | } 75 | 76 | -------------------------------------------------------------------------------- /cmd/yarn-operator/options/options.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package options 18 | 19 | import ( 20 | "flag" 21 | "fmt" 22 | "strings" 23 | 24 | "github.com/spf13/pflag" 25 | "k8s.io/klog/v2" 26 | "sigs.k8s.io/controller-runtime/pkg/manager" 27 | ) 28 | 29 | type Options struct { 30 | ControllerAddFuncs map[string]func(manager.Manager) error 31 | Controllers []string 32 | } 33 | 34 | func NewOptions() *Options { 35 | return &Options{ 36 | ControllerAddFuncs: controllerAddFuncs, 37 | Controllers: controllerAddDefault, 38 | } 39 | } 40 | 41 | func (o *Options) InitFlags(fs *flag.FlagSet) { 42 | pflag.StringSliceVar(&o.Controllers, "controllers", o.Controllers, fmt.Sprintf("A list of controllers to enable. "+ 43 | "'-controllers=*' enables all controllers. "+ 44 | "'-controllers=yarnresource' means only the 'yarnresource' controller is enabled. 
"+ 45 | "'-controllers=*,-yarnresource' means all controllers except the 'yarnresource' controller are enabled.\n"+ 46 | "All controllers: %s", strings.Join(o.Controllers, ", "))) 47 | } 48 | 49 | func (o *Options) ApplyTo(m manager.Manager) error { 50 | for controllerName, addFn := range o.ControllerAddFuncs { 51 | if !isControllerEnabled(controllerName, o.Controllers) { 52 | klog.Warningf("controller %q is disabled", controllerName) 53 | continue 54 | } 55 | 56 | if err := addFn(m); err != nil { 57 | klog.Errorf("Unable to create controller %s, err: %v", controllerName, err) 58 | return err 59 | } else { 60 | klog.V(4).Infof("controller %q added", controllerName) 61 | } 62 | } 63 | 64 | return nil 65 | } 66 | 67 | func isControllerEnabled(controllerName string, controllers []string) bool { 68 | hasStar := false 69 | for _, c := range controllers { 70 | if c == controllerName { 71 | return true 72 | } 73 | if c == "-"+controllerName { 74 | return false 75 | } 76 | if c == "*" { 77 | hasStar = true 78 | } 79 | } 80 | return hasStar 81 | } 82 | -------------------------------------------------------------------------------- /pkg/copilot-agent/runtime/manager.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package runtime 18 | 19 | import ( 20 | "io/fs" 21 | "path/filepath" 22 | "sync" 23 | "time" 24 | 25 | "k8s.io/apimachinery/pkg/util/runtime" 26 | "k8s.io/klog/v2" 27 | ) 28 | 29 | var defaultManger *Manger 30 | 31 | type Manger struct { 32 | SocketDir string 33 | 34 | Plugins map[string]CustomRuntimePlugin 35 | 36 | mtx sync.RWMutex 37 | } 38 | 39 | func NewManger(socketDir string) *Manger { 40 | m := &Manger{SocketDir: socketDir, Plugins: map[string]CustomRuntimePlugin{}, mtx: sync.RWMutex{}} 41 | defaultManger = m 42 | return m 43 | } 44 | 45 | func (m *Manger) Run(stopCh <-chan struct{}) error { 46 | defer runtime.HandleCrash() 47 | tick := time.NewTicker(time.Minute) 48 | for { 49 | select { 50 | case <-tick.C: 51 | if err := m.run(); err != nil { 52 | klog.Warning(err) 53 | } 54 | case <-stopCh: 55 | break 56 | } 57 | } 58 | } 59 | 60 | func (m *Manger) run() error { 61 | 62 | klog.V(4).Info("watch socket path %s", m.SocketDir) 63 | err := filepath.Walk(m.SocketDir, func(path string, info fs.FileInfo, err error) error { 64 | if info.Mode().Type() != fs.ModeSocket { 65 | klog.V(4).Infof("%s is not socket", path) 66 | return nil 67 | } 68 | p := newHttpPlugin(path) 69 | pluginInfo, err1 := p.Info() 70 | if err1 != nil { 71 | return err1 72 | } 73 | klog.Infof("discover plugin %s", pluginInfo.Name) 74 | m.mtx.Lock() 75 | m.Plugins[pluginInfo.Name] = p 76 | m.mtx.Unlock() 77 | return nil 78 | }) 79 | if err != nil { 80 | return err 81 | } 82 | return nil 83 | } 84 | 85 | func (m *Manger) GetAliveCopilots() map[string]CustomRuntimePlugin { 86 | m.mtx.RLock() 87 | defer m.mtx.RUnlock() 88 | res := map[string]CustomRuntimePlugin{} 89 | for key, plugin := range m.Plugins { 90 | if plugin.IsAlive() { 91 | res[key] = plugin 92 | } 93 | } 94 | return res 95 | } 96 | 97 | func (m *Manger) GetCopilot(name string) (CustomRuntimePlugin, bool) { 98 | m.mtx.RLock() 99 | defer m.mtx.RUnlock() 100 | res, exist := m.Plugins[name] 101 | return res, exist 102 | } 
103 | -------------------------------------------------------------------------------- /pkg/yarn/apis/service/applicationclient_service.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package service 18 | 19 | import ( 20 | "encoding/json" 21 | "math" 22 | 23 | uuid "github.com/nu7hatch/gouuid" 24 | "google.golang.org/protobuf/proto" 25 | 26 | gohadoop "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/auth" 27 | "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopyarn" 28 | hadoop_ipc_client "github.com/koordinator-sh/yarn-copilot/pkg/yarn/client/ipc" 29 | yarn_conf "github.com/koordinator-sh/yarn-copilot/pkg/yarn/config" 30 | ) 31 | 32 | // Reference proto, json, and math imports to suppress error if they are not otherwise used. 
33 | var _ = proto.Marshal 34 | var _ = &json.SyntaxError{} 35 | var _ = math.Inf 36 | 37 | var APPLICATION_CLIENT_PROTOCOL = "org.apache.hadoop.yarn.api.ApplicationClientProtocolPB" 38 | 39 | func init() { 40 | 41 | } 42 | 43 | type ApplicationClientProtocolService interface { 44 | GetClusterNodes(in *hadoopyarn.GetClusterNodesRequestProto, out *hadoopyarn.GetClusterNodesResponseProto) error 45 | } 46 | 47 | var _ ApplicationClientProtocolService = &ApplicationClientProtocolServiceClient{} 48 | 49 | type ApplicationClientProtocolServiceClient struct { 50 | *hadoop_ipc_client.Client 51 | } 52 | 53 | func (c *ApplicationClientProtocolServiceClient) GetClusterNodes(in *hadoopyarn.GetClusterNodesRequestProto, out *hadoopyarn.GetClusterNodesResponseProto) error { 54 | return c.Call(gohadoop.GetCalleeRPCRequestHeaderProto(&APPLICATION_CLIENT_PROTOCOL), in, out) 55 | } 56 | 57 | func DialApplicationClientProtocolService(conf yarn_conf.YarnConfiguration, rmAddress *string) (ApplicationClientProtocolService, error) { 58 | clientId, err := uuid.NewV4() 59 | if err != nil { 60 | return nil, err 61 | } 62 | ugi, err := gohadoop.CreateSimpleUGIProto() 63 | if err != nil { 64 | return nil, err 65 | } 66 | 67 | var serverAddress string 68 | if rmAddress != nil { 69 | serverAddress = *rmAddress 70 | } else if serverAddress, err = conf.GetRMAddress(); err != nil { 71 | return nil, err 72 | } 73 | 74 | c := &hadoop_ipc_client.Client{ClientId: clientId, Ugi: ugi, ServerAddress: serverAddress} 75 | return &ApplicationClientProtocolServiceClient{c}, nil 76 | } 77 | -------------------------------------------------------------------------------- /config/manager/yarn-operator.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | labels: 5 | control-plane: koordinator 6 | name: koordinator-system 7 | --- 8 | apiVersion: apps/v1 9 | kind: Deployment 10 | metadata: 11 | name: koord-yarn-operator 12 | 
namespace: koordinator-system 13 | labels: 14 | koord-app: koord-yarn-operator 15 | spec: 16 | replicas: 1 17 | selector: 18 | matchLabels: 19 | koord-app: koord-yarn-operator 20 | strategy: 21 | rollingUpdate: 22 | maxSurge: 100% 23 | maxUnavailable: 0 24 | type: RollingUpdate 25 | template: 26 | metadata: 27 | labels: 28 | koord-app: koord-yarn-operator 29 | spec: 30 | affinity: 31 | podAntiAffinity: 32 | preferredDuringSchedulingIgnoredDuringExecution: 33 | - podAffinityTerm: 34 | labelSelector: 35 | matchExpressions: 36 | - key: koord-app 37 | operator: In 38 | values: 39 | - koord-yarn-operator 40 | topologyKey: kubernetes.io/hostname 41 | weight: 100 42 | containers: 43 | - args: 44 | - --enable-leader-election 45 | - --metrics-addr=:8080 46 | - --health-probe-addr=:8000 47 | - --logtostderr=true 48 | - --leader-election-namespace=koordinator-system 49 | - --v=4 50 | command: 51 | - /koord-yarn-operator 52 | env: 53 | - name: POD_NAMESPACE 54 | valueFrom: 55 | fieldRef: 56 | apiVersion: v1 57 | fieldPath: metadata.namespace 58 | - name: HADOOP_CONF_DIR 59 | value: /etc/hadoop-conf 60 | image: registry.cn-beijing.aliyuncs.com/koordinator-sh/yarn-operator:fix-rpc-8843dea 61 | imagePullPolicy: Always 62 | name: yarn-operator 63 | ports: 64 | - containerPort: 9876 65 | name: webhook-server 66 | protocol: TCP 67 | - containerPort: 8080 68 | name: metrics 69 | protocol: TCP 70 | - containerPort: 8000 71 | name: health 72 | protocol: TCP 73 | resources: 74 | limits: 75 | cpu: "1" 76 | memory: 1Gi 77 | requests: 78 | cpu: 500m 79 | memory: 256Mi 80 | volumeMounts: 81 | - name: yarn-config-volume 82 | mountPath: /etc/hadoop-conf 83 | volumes: 84 | - name: yarn-config-volume 85 | configMap: 86 | name: yarn-config 87 | restartPolicy: Always 88 | serviceAccountName: koord-yarn-operator 89 | terminationGracePeriodSeconds: 10 90 | -------------------------------------------------------------------------------- 
/pkg/yarn/apis/service/resourcemanager_administration_service.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package service 18 | 19 | import ( 20 | "encoding/json" 21 | "math" 22 | 23 | uuid "github.com/nu7hatch/gouuid" 24 | "google.golang.org/protobuf/proto" 25 | 26 | gohadoop "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/auth" 27 | yarnserver "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopyarn/server" 28 | hadoop_ipc_client "github.com/koordinator-sh/yarn-copilot/pkg/yarn/client/ipc" 29 | yarn_conf "github.com/koordinator-sh/yarn-copilot/pkg/yarn/config" 30 | ) 31 | 32 | // Reference proto, json, and math imports to suppress error if they are not otherwise used. 
33 | var _ = proto.Marshal 34 | var _ = &json.SyntaxError{} 35 | var _ = math.Inf 36 | 37 | var RESOURCE_MANAGER_ADMIN_PROTOCOL = "org.apache.hadoop.yarn.server.api.ResourceManagerAdministrationProtocolPB" 38 | 39 | func init() { 40 | } 41 | 42 | type ResourceManagerAdministrationProtocolService interface { 43 | UpdateNodeResource(in *yarnserver.UpdateNodeResourceRequestProto, out *yarnserver.UpdateNodeResourceResponseProto) error 44 | } 45 | 46 | type ResourceManagerAdministrationProtocolServiceClient struct { 47 | *hadoop_ipc_client.Client 48 | } 49 | 50 | func (c *ResourceManagerAdministrationProtocolServiceClient) UpdateNodeResource(in *yarnserver.UpdateNodeResourceRequestProto, out *yarnserver.UpdateNodeResourceResponseProto) error { 51 | return c.Call(gohadoop.GetCalleeRPCRequestHeaderProto(&RESOURCE_MANAGER_ADMIN_PROTOCOL), in, out) 52 | } 53 | 54 | func DialResourceManagerAdministrationProtocolService(conf yarn_conf.YarnConfiguration, rmAddress *string) (ResourceManagerAdministrationProtocolService, error) { 55 | clientId, err := uuid.NewV4() 56 | if err != nil { 57 | return nil, err 58 | } 59 | ugi, err := gohadoop.CreateSimpleUGIProto() 60 | if err != nil { 61 | return nil, err 62 | } 63 | 64 | var serverAddress string 65 | if rmAddress != nil { 66 | serverAddress = *rmAddress 67 | } else if serverAddress, err = conf.GetRMAdminAddress(); err != nil { 68 | return nil, err 69 | } 70 | 71 | c := &hadoop_ipc_client.Client{ClientId: clientId, Ugi: ugi, ServerAddress: serverAddress} 72 | return &ResourceManagerAdministrationProtocolServiceClient{c}, nil 73 | } 74 | -------------------------------------------------------------------------------- /cmd/yarn-copilot-agent/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "flag" 21 | "os" 22 | "time" 23 | 24 | statesinformer "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer/impl" 25 | "k8s.io/klog/v2" 26 | "sigs.k8s.io/controller-runtime/pkg/manager/signals" 27 | 28 | "github.com/koordinator-sh/yarn-copilot/cmd/yarn-copilot-agent/options" 29 | "github.com/koordinator-sh/yarn-copilot/pkg/copilot-agent/nm" 30 | "github.com/koordinator-sh/yarn-copilot/pkg/copilot-agent/server" 31 | ) 32 | 33 | func main() { 34 | f := flag.NewFlagSet(os.Args[0], flag.ExitOnError) 35 | conf := options.NewConfiguration() 36 | klog.InitFlags(f) 37 | f.StringVar(&conf.ServerEndpoint, "server-endpoint", conf.ServerEndpoint, "yarn copilot server endpoint.") 38 | f.StringVar(&conf.YarnContainerCgroupPath, "yarn-container-cgroup-path", conf.YarnContainerCgroupPath, "yarn container cgroup path.") 39 | f.StringVar(&conf.NodeMangerEndpoint, "node-manager-endpoint", conf.NodeMangerEndpoint, "node manger endpoint") 40 | f.BoolVar(&conf.SyncMemoryCgroup, "sync-memory-cgroup", conf.SyncMemoryCgroup, "true to sync cpu cgroup info to memory, used for hadoop 2.x") 41 | f.DurationVar(&conf.SyncCgroupPeriod, "sync-cgroup-period", conf.SyncCgroupPeriod, "period of resync all cpu/memory cgroup") 42 | f.StringVar(&conf.CgroupRootDir, "cgroup-root-dir", conf.CgroupRootDir, "cgroup root directory") 43 | help := f.Bool("help", false, "help information") 44 | 45 | if err := f.Parse(os.Args[1:]); err != nil { 46 | klog.Fatal(err) 47 | } 48 | if *help { 49 | f.Usage() 50 | os.Exit(0) 51 | 
} 52 | f.VisitAll(func(f *flag.Flag) { 53 | klog.Infof("args: %s = %s", f.Name, f.Value) 54 | }) 55 | stopCtx := signals.SetupSignalHandler() 56 | kubelet, _ := statesinformer.NewKubeletStub("127.0.0.1", 10255, "http", time.Second*5, nil) 57 | operator, err := nm.NewNodeMangerOperator(conf.CgroupRootDir, conf.YarnContainerCgroupPath, conf.SyncMemoryCgroup, conf.NodeMangerEndpoint, conf.SyncCgroupPeriod, kubelet) 58 | if err != nil { 59 | klog.Fatal(err) 60 | } 61 | go func() { 62 | if err := operator.Run(stopCtx.Done()); err != nil { 63 | klog.Error(err) 64 | } 65 | }() 66 | err = server.NewYarnCopilotServer(operator, conf.ServerEndpoint).Run(stopCtx) 67 | if err != nil { 68 | klog.Fatal(err) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - release-* 8 | pull_request: { } 9 | workflow_dispatch: { } 10 | 11 | jobs: 12 | golangci-lint: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/setup-go@v4 17 | with: 18 | cache: false 19 | go-version-file: go.mod 20 | - uses: golangci/golangci-lint-action@v3 21 | with: 22 | version: v1.50.1 23 | args: --timeout 10m0s 24 | 25 | unit-tests: 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | include: 30 | - step: mod 31 | name: Check Go modules 32 | command: go mod tidy && git add go.* && git diff --cached --exit-code || (echo 'Please run "go mod tidy" to sync Go modules' && exit 1); 33 | - step: manifests 34 | name: Check manifests 35 | command: make manifests && git add config && git diff --cached --exit-code || (echo 'Please run "make manifests" to generate manifests' && exit 1); 36 | - step: generate 37 | name: Check auto-generated codes 38 | command: make generate && git add pkg apis && git diff --cached --exit-code || (echo 'Please run "make generate" to 
generate Go codes' && exit 1); 39 | - step: gofmt 40 | name: Verify gofmt 41 | command: make fmt && git add apis pkg cmd && git diff --cached --exit-code || (echo 'Please run "make fmt" to verify gofmt' && exit 1); 42 | - step: govet 43 | name: Verify govet 44 | command: make vet && git add apis pkg cmd && git diff --cached --exit-code || (echo 'Please run "make vet" to verify govet' && exit 1); 45 | - step: build 46 | name: Run Go build 47 | command: make build 48 | - step: test 49 | name: Run Go test 50 | command: make fast-test 51 | name: unit-tests(${{ matrix.name }}) 52 | runs-on: ubuntu-latest 53 | steps: 54 | - uses: actions/checkout@v4 55 | - uses: actions/setup-go@v4 56 | with: 57 | cache: false 58 | go-version-file: go.mod 59 | - uses: actions/cache@v4 60 | with: 61 | path: | 62 | ~/.cache/go-build 63 | ~/go/pkg/mod 64 | key: ${{ runner.os }}-ut-${{ matrix.step }}-${{ hashFiles('**/go.sum') }} 65 | restore-keys: | 66 | ${{ runner.os }}-ut-${{ matrix.step }}- 67 | - name: ${{ matrix.name }} 68 | run: | 69 | ${{ matrix.command }} 70 | - if: matrix.step == 'test' 71 | name: Upload coverage to Codecov 72 | uses: codecov/codecov-action@v4 73 | env: 74 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 75 | with: 76 | token: ${{ secrets.CODECOV_TOKEN }} 77 | flags: unittests 78 | file: cover.out 79 | fail_ci_if_error: true 80 | -------------------------------------------------------------------------------- /pkg/controller/metrics/yarn_collector.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package metrics 18 | 19 | import ( 20 | "github.com/prometheus/client_golang/prometheus" 21 | 22 | "github.com/koordinator-sh/yarn-copilot/pkg/yarn/cache" 23 | ) 24 | 25 | var ( 26 | yarnNodeCPUMetric = prometheus.NewDesc( 27 | yarnNodeCPUResource, 28 | "yarn node cpu resource", 29 | []string{"instance", "cluster"}, 30 | nil) 31 | yarnNodeMemoryMetric = prometheus.NewDesc( 32 | yarnNodeMemoryResource, 33 | "yarn node memory resource", 34 | []string{"instance", "cluster"}, 35 | nil) 36 | yarnNodeCPUAllocatedMetric = prometheus.NewDesc( 37 | yarnNodeCPUAllocatedResource, 38 | "yarn node cpu resource", 39 | []string{"instance", "cluster"}, 40 | nil) 41 | yarnNodeMemoryAllocatedMetric = prometheus.NewDesc( 42 | yarnNodeMemoryAllocatedResource, 43 | "yarn node memory resource", 44 | []string{"instance", "cluster"}, 45 | nil) 46 | ) 47 | 48 | type YarnMetricCollector struct { 49 | cache *cache.NodesSyncer 50 | } 51 | 52 | func NewYarnMetricCollector(cache *cache.NodesSyncer) *YarnMetricCollector { 53 | return &YarnMetricCollector{cache: cache} 54 | } 55 | 56 | func (y *YarnMetricCollector) Describe(descs chan<- *prometheus.Desc) { 57 | descs <- yarnNodeCPUMetric 58 | descs <- yarnNodeMemoryMetric 59 | descs <- yarnNodeCPUAllocatedMetric 60 | descs <- yarnNodeMemoryAllocatedMetric 61 | } 62 | 63 | func (y *YarnMetricCollector) Collect(metrics chan<- prometheus.Metric) { 64 | for clusterID, nodes := range y.cache.GetYarnNodeInfo() { 65 | for _, node := range nodes { 66 | metrics <- prometheus.MustNewConstMetric( 67 | 
yarnNodeCPUMetric, 68 | prometheus.GaugeValue, 69 | float64(node.Capability.GetVirtualCores()), 70 | node.NodeId.GetHost(), 71 | clusterID, 72 | ) 73 | metrics <- prometheus.MustNewConstMetric( 74 | yarnNodeMemoryMetric, 75 | prometheus.GaugeValue, 76 | float64(node.Capability.GetMemory()*1024*1024), 77 | node.NodeId.GetHost(), 78 | clusterID, 79 | ) 80 | metrics <- prometheus.MustNewConstMetric( 81 | yarnNodeCPUAllocatedMetric, 82 | prometheus.GaugeValue, 83 | float64(node.Used.GetVirtualCores()), 84 | node.NodeId.GetHost(), 85 | clusterID, 86 | ) 87 | metrics <- prometheus.MustNewConstMetric( 88 | yarnNodeMemoryAllocatedMetric, 89 | prometheus.GaugeValue, 90 | float64(node.Used.GetMemory()*1024*1024), 91 | node.NodeId.GetHost(), 92 | clusterID, 93 | ) 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /pkg/yarn/apis/proto/hadoopcommon/ProtocolInfo.proto: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * These .proto interfaces are private and stable. 
syntax = "proto2";
option java_package = "org.apache.hadoop.ipc.protobuf";
option java_outer_classname = "ProtocolInfoProtos";
option java_generic_services = true;
option java_generate_equals_and_hash = true;
package hadoop.common;

/**
 * Request to get protocol versions for all supported rpc kinds.
 */
message GetProtocolVersionsRequestProto {
  required string protocol = 1; // Protocol name
}

/**
 * Protocol version with corresponding rpc kind.
 */
message ProtocolVersionProto {
  required string rpcKind = 1; // RPC kind
  repeated uint64 versions = 2; // Protocol versions corresponding to the rpc kind.
}

/**
 * Get protocol version response.
 */
message GetProtocolVersionsResponseProto {
  repeated ProtocolVersionProto protocolVersions = 1;
}

/**
 * Get protocol signature request.
 */
message GetProtocolSignatureRequestProto {
  required string protocol = 1; // Protocol name
  required string rpcKind = 2; // RPC kind
}

/**
 * Get protocol signature response.
 */
message GetProtocolSignatureResponseProto {
  repeated ProtocolSignatureProto protocolSignature = 1;
}

/**
 * Signature of one protocol version: the version number together with
 * its list of method identifiers.
 */
message ProtocolSignatureProto {
  required uint64 version = 1;
  repeated uint32 methods = 2;
}

/**
 * Protocol to get information about protocols.
 */
service ProtocolInfoService {
  /**
   * Return protocol version corresponding to protocol interface for each
   * supported rpc kind.
   */
  rpc getProtocolVersions(GetProtocolVersionsRequestProto)
      returns (GetProtocolVersionsResponseProto);

  /**
   * Return the protocol signatures for the requested protocol interface
   * and rpc kind.
   */
  rpc getProtocolSignature(GetProtocolSignatureRequestProto)
      returns (GetProtocolSignatureResponseProto);
}
syntax = "proto2";
option java_package = "org.apache.hadoop.ipc.protobuf";
option java_outer_classname = "ProtobufRpcEngineProtos";
option java_generate_equals_and_hash = true;
package hadoop.common;

/**
 * This message is the header for the Protobuf Rpc Engine
 * when sending an RPC request from the RPC client to the RPC server.
 * The actual request (serialized as protobuf) follows this request.
 *
 * No special header is needed for the Rpc Response for Protobuf Rpc Engine.
 * The normal RPC response header (see RpcHeader.proto) is sufficient.
 */
message RequestHeaderProto {
  /** Name of the RPC method */
  required string methodName = 1;

  /**
   * RPCs for a particular interface (i.e. protocol) are done using an
   * IPC connection that is set up using rpcProxy.
   * The rpcProxy has a declared protocol name that is
   * sent from client to server at connection time.
   *
   * Each Rpc call also sends a protocol name
   * (called declaringClassProtocolName). This name is usually the same
   * as the connection protocol name except in some cases.
   * For example, metaProtocols such as ProtocolInfoProto, which get metainfo
   * about the protocol, reuse the connection but need to indicate that
   * the actual protocol is different (i.e. the protocol is
   * ProtocolInfoProto); in this case
   * the declaringClassProtocolName field is set to ProtocolInfoProto.
   */
  required string declaringClassProtocolName = 2;

  /** protocol version of class declaring the called method */
  required uint64 clientProtocolVersion = 3;
}
23 | */ 24 | 25 | syntax = "proto2"; 26 | option java_package = "org.apache.hadoop.fs"; 27 | option java_outer_classname = "FSProtos"; 28 | option java_generic_services = true; 29 | option java_generate_equals_and_hash = true; 30 | package hadoop.fs; 31 | 32 | message FsPermissionProto { 33 | required uint32 perm = 1; // UNIX-style mode bits 34 | } 35 | 36 | /* 37 | * FileStatus encoding. Field IDs match those from HdfsFileStatusProto, but 38 | * cross-serialization is not an explicitly supported use case. Unlike HDFS, 39 | * most fields are optional and do not define defaults. 40 | */ 41 | message FileStatusProto { 42 | enum FileType { 43 | FT_DIR = 1; 44 | FT_FILE = 2; 45 | FT_SYMLINK = 3; 46 | } 47 | enum Flags { 48 | HAS_ACL = 0x01; // has ACLs 49 | HAS_CRYPT = 0x02; // encrypted 50 | HAS_EC = 0x04; // erasure coded 51 | SNAPSHOT_ENABLED = 0x08; // snapshot enabled 52 | } 53 | required FileType fileType = 1; 54 | required string path = 2; 55 | optional uint64 length = 3; 56 | optional FsPermissionProto permission = 4; 57 | optional string owner = 5; 58 | optional string group = 6; 59 | optional uint64 modification_time = 7; 60 | optional uint64 access_time = 8; 61 | optional string symlink = 9; 62 | optional uint32 block_replication = 10; 63 | optional uint64 block_size = 11; 64 | // locations = 12 65 | // alias = 13 66 | // childrenNum = 14 67 | optional bytes encryption_data = 15; 68 | // storagePolicy = 16 69 | optional bytes ec_data = 17; 70 | optional uint32 flags = 18 [default = 0]; 71 | } 72 | 73 | /** 74 | * Placeholder type for consistent basic FileSystem operations. 
75 | */ 76 | message LocalFileSystemPathHandleProto { 77 | optional uint64 mtime = 1; 78 | optional string path = 2; 79 | } 80 | -------------------------------------------------------------------------------- /pkg/yarn/apis/proto/hadoopyarn/server/resourcemanager_administration_protocol.proto: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * These .proto interfaces are private and stable. 21 | * Please see http://wiki.apache.org/hadoop/Compatibility 22 | * for what changes are allowed for a *stable* .proto interface. 
23 | */ 24 | 25 | syntax = "proto2"; 26 | option java_package = "org.apache.hadoop.yarn.proto"; 27 | option java_outer_classname = "ResourceManagerAdministrationProtocol"; 28 | option java_generic_services = true; 29 | option java_generate_equals_and_hash = true; 30 | package hadoop.yarn; 31 | 32 | import "yarn_server_resourcemanager_service_protos.proto"; 33 | 34 | service ResourceManagerAdministrationProtocolService { 35 | rpc refreshQueues(RefreshQueuesRequestProto) returns (RefreshQueuesResponseProto); 36 | rpc refreshNodes(RefreshNodesRequestProto) returns (RefreshNodesResponseProto); 37 | rpc refreshSuperUserGroupsConfiguration(RefreshSuperUserGroupsConfigurationRequestProto) returns (RefreshSuperUserGroupsConfigurationResponseProto); 38 | rpc refreshUserToGroupsMappings(RefreshUserToGroupsMappingsRequestProto) returns (RefreshUserToGroupsMappingsResponseProto); 39 | rpc refreshAdminAcls(RefreshAdminAclsRequestProto) returns (RefreshAdminAclsResponseProto); 40 | rpc refreshServiceAcls(RefreshServiceAclsRequestProto) returns (RefreshServiceAclsResponseProto); 41 | rpc getGroupsForUser(GetGroupsForUserRequestProto) returns (GetGroupsForUserResponseProto); 42 | rpc updateNodeResource(UpdateNodeResourceRequestProto) returns (UpdateNodeResourceResponseProto); 43 | rpc refreshNodesResources(RefreshNodesResourcesRequestProto) returns (RefreshNodesResourcesResponseProto); 44 | rpc addToClusterNodeLabels(AddToClusterNodeLabelsRequestProto) returns (AddToClusterNodeLabelsResponseProto); 45 | rpc removeFromClusterNodeLabels(RemoveFromClusterNodeLabelsRequestProto) returns (RemoveFromClusterNodeLabelsResponseProto); 46 | rpc replaceLabelsOnNodes(ReplaceLabelsOnNodeRequestProto) returns (ReplaceLabelsOnNodeResponseProto); 47 | rpc checkForDecommissioningNodes(CheckForDecommissioningNodesRequestProto) returns (CheckForDecommissioningNodesResponseProto); 48 | rpc refreshClusterMaxPriority(RefreshClusterMaxPriorityRequestProto) returns 
(RefreshClusterMaxPriorityResponseProto); 49 | rpc mapAttributesToNodes(NodesToAttributesMappingRequestProto) returns (NodesToAttributesMappingResponseProto); 50 | } 51 | -------------------------------------------------------------------------------- /pkg/yarn/client/factory.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package client 18 | 19 | import ( 20 | "io/fs" 21 | "os" 22 | "path/filepath" 23 | "strings" 24 | 25 | "k8s.io/klog/v2" 26 | ) 27 | 28 | const ( 29 | envHadoopConfDir = "HADOOP_CONF_DIR" 30 | 31 | DefaultClusterID = "__default_yarn_cluster__" 32 | ) 33 | 34 | type YarnClientFactory interface { 35 | CreateDefaultYarnClient() (YarnClient, error) 36 | CreateYarnClientByClusterID(clusterID string) (YarnClient, error) 37 | CreateAllYarnClients() (map[string]YarnClient, error) 38 | } 39 | 40 | var DefaultYarnClientFactory YarnClientFactory = &yarnClientFactory{configDir: os.Getenv(envHadoopConfDir)} 41 | 42 | type yarnClientFactory struct { 43 | configDir string 44 | } 45 | 46 | func (f *yarnClientFactory) CreateDefaultYarnClient() (YarnClient, error) { 47 | c := NewYarnClient(f.configDir, "") 48 | if err := c.Initialize(); err != nil { 49 | return nil, err 50 | } 51 | return c, nil 52 | } 53 | 54 | func (f *yarnClientFactory) CreateYarnClientByClusterID(clusterID string) (YarnClient, 
error) { 55 | c := NewYarnClient(f.configDir, clusterID) 56 | if err := c.Initialize(); err != nil { 57 | return nil, err 58 | } 59 | return c, nil 60 | } 61 | 62 | func (f *yarnClientFactory) CreateAllYarnClients() (map[string]YarnClient, error) { 63 | ids, err := f.getAllKnownClusterID() 64 | if err != nil { 65 | return nil, err 66 | } 67 | clients := map[string]YarnClient{} 68 | for _, id := range ids { 69 | yClient, err := f.CreateYarnClientByClusterID(id) 70 | if err != nil { 71 | klog.Errorf("create yarn client %v failed, error %v", id, err) 72 | return nil, err 73 | } 74 | clients[id] = yClient 75 | klog.V(3).Infof("init yarn client %v", id) 76 | } 77 | if defaultClient, err := f.CreateDefaultYarnClient(); err == nil { 78 | clients[DefaultClusterID] = defaultClient 79 | klog.V(3).Infof("init yarn client %v", defaultClient) 80 | } else { 81 | klog.Errorf("create yarn client %v failed, error %v", DefaultClusterID, err) 82 | return nil, err 83 | } 84 | return clients, nil 85 | } 86 | 87 | func (f *yarnClientFactory) getAllKnownClusterID() ([]string, error) { 88 | res := []string{} 89 | err := filepath.WalkDir(f.configDir, func(path string, d fs.DirEntry, err error) error { 90 | if d.IsDir() { 91 | return nil 92 | } 93 | if strings.HasSuffix(d.Name(), ".yarn-site.xml") { 94 | res = append(res, strings.ReplaceAll(d.Name(), ".yarn-site.xml", "")) 95 | } 96 | return nil 97 | }) 98 | if err != nil { 99 | return nil, err 100 | } 101 | return res, nil 102 | } 103 | -------------------------------------------------------------------------------- /pkg/yarn/apis/security/ugi.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2013 The Cloudera Inc. 3 | Copyright 2023 The Koordinator Authors. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 
7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | 18 | package security 19 | 20 | import ( 21 | "os/user" 22 | "sync" 23 | 24 | "k8s.io/klog/v2" 25 | 26 | hadoop_common "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopcommon" 27 | ) 28 | 29 | /** a (very) basic UserGroupInformation implementation for storing user data/tokens, 30 | This implementation is currently *not* thread-safe 31 | */ 32 | 33 | type UserGroupInformation struct { 34 | // rwMutex sync.RWMutex 35 | userInfo *hadoop_common.UserInformationProto 36 | userTokens map[string]*hadoop_common.TokenProto 37 | } 38 | 39 | var once sync.Once 40 | var currentUserGroupInformation *UserGroupInformation 41 | var maxTokens = 16 42 | 43 | func CreateCurrentUserInfoProto() (*hadoop_common.UserInformationProto, error) { 44 | // Figure the current user-name 45 | var username string 46 | if currentUser, err := user.Current(); err != nil { 47 | klog.Warningf("user.Current", err) 48 | return nil, err 49 | } else { 50 | username = currentUser.Username 51 | } 52 | 53 | return &hadoop_common.UserInformationProto{EffectiveUser: nil, RealUser: &username}, nil 54 | } 55 | 56 | func Allocate(userInfo *hadoop_common.UserInformationProto, userTokens map[string]*hadoop_common.TokenProto) *UserGroupInformation { 57 | ugi := new(UserGroupInformation) 58 | 59 | if userInfo != nil { 60 | ugi.userInfo = userInfo 61 | } else { 62 | currentUserInfo, _ := CreateCurrentUserInfoProto() 63 | ugi.userInfo = currentUserInfo 64 | } 65 | 66 | if userTokens != nil { 67 | ugi.userTokens = userTokens 68 | } else { 69 | ugi.userTokens = 
make(map[string]*hadoop_common.TokenProto) //empty, with room for maxTokens tokens. 70 | } 71 | 72 | return ugi 73 | } 74 | 75 | func initializeCurrentUser() { 76 | once.Do(func() { 77 | currentUserGroupInformation = Allocate(nil, nil) 78 | }) 79 | } 80 | 81 | func (ugi *UserGroupInformation) GetUserInformation() *hadoop_common.UserInformationProto { 82 | return ugi.userInfo 83 | } 84 | 85 | func (ugi *UserGroupInformation) GetUserTokens() map[string]*hadoop_common.TokenProto { 86 | return ugi.userTokens 87 | } 88 | 89 | func (ugi *UserGroupInformation) AddUserTokenWithAlias(alias string, token *hadoop_common.TokenProto) { 90 | if token == nil { 91 | klog.Warningf("supplied token is nil!") 92 | return 93 | } 94 | 95 | if length := len(ugi.userTokens); length < maxTokens { 96 | ugi.userTokens[alias] = token 97 | } else { 98 | klog.Warningf("user already has maxTokens:", maxTokens) 99 | } 100 | } 101 | 102 | func (ugi *UserGroupInformation) AddUserToken(token *hadoop_common.TokenProto) { 103 | if token == nil { 104 | klog.Warningf("supplied token is nil!") 105 | return 106 | } 107 | 108 | ugi.AddUserTokenWithAlias(token.GetService(), token) 109 | } 110 | 111 | func GetCurrentUser() *UserGroupInformation { 112 | initializeCurrentUser() 113 | 114 | return currentUserGroupInformation 115 | } 116 | -------------------------------------------------------------------------------- /pkg/yarn/apis/auth/auth.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2013 The Cloudera Inc. 3 | Copyright 2023 The Koordinator Authors. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 
7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | 18 | package auth 19 | 20 | import ( 21 | "bytes" 22 | "encoding/binary" 23 | "os/user" 24 | "runtime" 25 | "strings" 26 | "unicode" 27 | 28 | "k8s.io/klog/v2" 29 | 30 | hadoop_common "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopcommon" 31 | ) 32 | 33 | var ( 34 | RPC_HEADER []byte = []byte("hrpc") 35 | VERSION []byte = []byte{0x09} 36 | RPC_SERVICE_CLASS byte = 0x00 37 | 38 | RPC_PROTOCOL_BUFFFER hadoop_common.RpcKindProto = hadoop_common.RpcKindProto_RPC_PROTOCOL_BUFFER 39 | RPC_FINAL_PACKET hadoop_common.RpcRequestHeaderProto_OperationProto = hadoop_common.RpcRequestHeaderProto_RPC_FINAL_PACKET 40 | RPC_DEFAULT_RETRY_COUNT int32 = hadoop_common.Default_RpcRequestHeaderProto_RetryCount 41 | CLIENT_PROTOCOL_VERSION uint64 = 1 42 | ) 43 | 44 | type AuthMethod byte 45 | 46 | const ( 47 | AUTH_SIMPLE AuthMethod = 0x50 48 | AUTH_KERBEROS AuthMethod = 0x51 49 | AUTH_TOKEN AuthMethod = 0x52 50 | AUTH_PLAIN AuthMethod = 0x53 51 | ) 52 | 53 | func (authmethod AuthMethod) String() string { 54 | switch { 55 | case authmethod == AUTH_SIMPLE: 56 | return "SIMPLE" 57 | case authmethod == AUTH_KERBEROS: 58 | return "GSSAPI" 59 | case authmethod == AUTH_TOKEN: 60 | return "DIGEST-MD5" 61 | case authmethod == AUTH_PLAIN: 62 | return "PLAIN" 63 | } 64 | return "ERROR-UNKNOWN" 65 | } 66 | 67 | type AuthProtocol byte 68 | 69 | const ( 70 | AUTH_PROTOCOL_NONE AuthProtocol = 0x00 71 | AUTH_PROTOCOL_SASL AuthProtocol = 0xDF 72 | ) 73 | 74 | func (authprotocol AuthProtocol) String() string { 75 | switch { 76 | case authprotocol == 
AUTH_PROTOCOL_NONE: 77 | return "NONE" 78 | case authprotocol == AUTH_PROTOCOL_SASL: 79 | return "SASL" 80 | } 81 | return "ERROR-UNKNOWN" 82 | } 83 | 84 | func ConvertFixedToBytes(data interface{}) ([]byte, error) { 85 | buf := new(bytes.Buffer) 86 | err := binary.Write(buf, binary.BigEndian, data) 87 | return buf.Bytes(), err 88 | } 89 | 90 | func ConvertBytesToFixed(rawBytes []byte, data interface{}) error { 91 | buf := bytes.NewBuffer(rawBytes) 92 | err := binary.Read(buf, binary.BigEndian, data) 93 | return err 94 | } 95 | 96 | func GetCalleeRPCRequestHeaderProto(protocolName *string) *hadoop_common.RequestHeaderProto { 97 | pc, _, _, _ := runtime.Caller(1) // Callee Method Name 98 | fullName := runtime.FuncForPC(pc).Name() 99 | names := strings.Split(fullName, ".") 100 | unicodeName := []rune(names[len(names)-1]) 101 | unicodeName[0] = unicode.ToLower(unicodeName[0]) 102 | methodName := string(unicodeName) 103 | return &hadoop_common.RequestHeaderProto{MethodName: &methodName, DeclaringClassProtocolName: protocolName, ClientProtocolVersion: &CLIENT_PROTOCOL_VERSION} 104 | } 105 | 106 | func CreateSimpleUGIProto() (*hadoop_common.UserInformationProto, error) { 107 | // Figure the current user-name 108 | var username string 109 | if user, err := user.Current(); err != nil { 110 | klog.Warningf("user.Current", err) 111 | return nil, err 112 | } else { 113 | username = user.Username 114 | } 115 | 116 | return &hadoop_common.UserInformationProto{EffectiveUser: nil, RealUser: &username}, nil 117 | } 118 | -------------------------------------------------------------------------------- /pkg/yarn/client/mockclient/mock_factory.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // 17 | 18 | // Code generated by MockGen. DO NOT EDIT. 19 | // Source: pkg/yarn/client/factory.go 20 | 21 | // Package mock_client is a generated GoMock package. 22 | package mock_client 23 | 24 | import ( 25 | reflect "reflect" 26 | 27 | gomock "github.com/golang/mock/gomock" 28 | client "github.com/koordinator-sh/yarn-copilot/pkg/yarn/client" 29 | ) 30 | 31 | // MockYarnClientFactory is a mock of YarnClientFactory interface. 32 | type MockYarnClientFactory struct { 33 | ctrl *gomock.Controller 34 | recorder *MockYarnClientFactoryMockRecorder 35 | } 36 | 37 | // MockYarnClientFactoryMockRecorder is the mock recorder for MockYarnClientFactory. 38 | type MockYarnClientFactoryMockRecorder struct { 39 | mock *MockYarnClientFactory 40 | } 41 | 42 | // NewMockYarnClientFactory creates a new mock instance. 43 | func NewMockYarnClientFactory(ctrl *gomock.Controller) *MockYarnClientFactory { 44 | mock := &MockYarnClientFactory{ctrl: ctrl} 45 | mock.recorder = &MockYarnClientFactoryMockRecorder{mock} 46 | return mock 47 | } 48 | 49 | // EXPECT returns an object that allows the caller to indicate expected use. 50 | func (m *MockYarnClientFactory) EXPECT() *MockYarnClientFactoryMockRecorder { 51 | return m.recorder 52 | } 53 | 54 | // CreateAllYarnClients mocks base method. 
55 | func (m *MockYarnClientFactory) CreateAllYarnClients() (map[string]client.YarnClient, error) { 56 | m.ctrl.T.Helper() 57 | ret := m.ctrl.Call(m, "CreateAllYarnClients") 58 | ret0, _ := ret[0].(map[string]client.YarnClient) 59 | ret1, _ := ret[1].(error) 60 | return ret0, ret1 61 | } 62 | 63 | // CreateAllYarnClients indicates an expected call of CreateAllYarnClients. 64 | func (mr *MockYarnClientFactoryMockRecorder) CreateAllYarnClients() *gomock.Call { 65 | mr.mock.ctrl.T.Helper() 66 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateAllYarnClients", reflect.TypeOf((*MockYarnClientFactory)(nil).CreateAllYarnClients)) 67 | } 68 | 69 | // CreateDefaultYarnClient mocks base method. 70 | func (m *MockYarnClientFactory) CreateDefaultYarnClient() (client.YarnClient, error) { 71 | m.ctrl.T.Helper() 72 | ret := m.ctrl.Call(m, "CreateDefaultYarnClient") 73 | ret0, _ := ret[0].(client.YarnClient) 74 | ret1, _ := ret[1].(error) 75 | return ret0, ret1 76 | } 77 | 78 | // CreateDefaultYarnClient indicates an expected call of CreateDefaultYarnClient. 79 | func (mr *MockYarnClientFactoryMockRecorder) CreateDefaultYarnClient() *gomock.Call { 80 | mr.mock.ctrl.T.Helper() 81 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateDefaultYarnClient", reflect.TypeOf((*MockYarnClientFactory)(nil).CreateDefaultYarnClient)) 82 | } 83 | 84 | // CreateYarnClientByClusterID mocks base method. 85 | func (m *MockYarnClientFactory) CreateYarnClientByClusterID(clusterID string) (client.YarnClient, error) { 86 | m.ctrl.T.Helper() 87 | ret := m.ctrl.Call(m, "CreateYarnClientByClusterID", clusterID) 88 | ret0, _ := ret[0].(client.YarnClient) 89 | ret1, _ := ret[1].(error) 90 | return ret0, ret1 91 | } 92 | 93 | // CreateYarnClientByClusterID indicates an expected call of CreateYarnClientByClusterID. 
94 | func (mr *MockYarnClientFactoryMockRecorder) CreateYarnClientByClusterID(clusterID interface{}) *gomock.Call { 95 | mr.mock.ctrl.T.Helper() 96 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateYarnClientByClusterID", reflect.TypeOf((*MockYarnClientFactory)(nil).CreateYarnClientByClusterID), clusterID) 97 | } 98 | -------------------------------------------------------------------------------- /pkg/controller/noderesource/yarn_resource_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package noderesource 18 | 19 | import ( 20 | corev1 "k8s.io/api/core/v1" 21 | "k8s.io/apimachinery/pkg/api/resource" 22 | "reflect" 23 | "testing" 24 | ) 25 | 26 | func TestGetOriginExtendAllocatable(t *testing.T) { 27 | type args struct { 28 | annotations map[string]string 29 | } 30 | tests := []struct { 31 | name string 32 | args args 33 | want corev1.ResourceList 34 | wantErr bool 35 | }{ 36 | { 37 | name: "annotation not exist", 38 | args: args{ 39 | annotations: map[string]string{}, 40 | }, 41 | want: nil, 42 | wantErr: false, 43 | }, 44 | { 45 | name: "bad annotation format", 46 | args: args{ 47 | annotations: map[string]string{ 48 | NodeOriginExtendedAllocatableAnnotationKey: "bad-format", 49 | }, 50 | }, 51 | want: nil, 52 | wantErr: true, 53 | }, 54 | { 55 | name: "get from annotation succ", 56 | args: args{ 57 | annotations: map[string]string{ 58 | NodeOriginExtendedAllocatableAnnotationKey: "{\"resources\": {\"kubernetes.io/batch-cpu\": 1000,\"kubernetes.io/batch-memory\": 1024}}", 59 | }, 60 | }, 61 | want: map[corev1.ResourceName]resource.Quantity{ 62 | BatchCPU: resource.MustParse("1000"), 63 | BatchMemory: resource.MustParse("1024"), 64 | }, 65 | wantErr: false, 66 | }, 67 | } 68 | for _, tt := range tests { 69 | t.Run(tt.name, func(t *testing.T) { 70 | got, err := GetOriginExtendedAllocatableRes(tt.args.annotations) 71 | if (err != nil) != tt.wantErr { 72 | t.Errorf("GetOriginExtendAllocatableRes() error = %v, wantErr %v", err, tt.wantErr) 73 | return 74 | } 75 | if !reflect.DeepEqual(got, tt.want) { 76 | t.Errorf("GetOriginExtendAllocatableRes() got = %v, want %v", got, tt.want) 77 | } 78 | }) 79 | } 80 | } 81 | 82 | func TestGetNodeAllocated(t *testing.T) { 83 | type args struct { 84 | annotations map[string]string 85 | } 86 | tests := []struct { 87 | name string 88 | args args 89 | want corev1.ResourceList 90 | wantErr bool 91 | }{ 92 | { 93 | name: "annotation not exist", 94 | args: args{ 95 | annotations: map[string]string{}, 
96 | }, 97 | want: nil, 98 | wantErr: false, 99 | }, 100 | { 101 | name: "bad annotation format", 102 | args: args{ 103 | annotations: map[string]string{ 104 | NodeThirdPartyAllocationsAnnotationKey: "bad-format", 105 | }, 106 | }, 107 | want: nil, 108 | wantErr: true, 109 | }, 110 | { 111 | name: "get from annotation succ", 112 | args: args{ 113 | annotations: map[string]string{ 114 | NodeThirdPartyAllocationsAnnotationKey: "{\"allocations\":[{\"name\":\"hadoop-yarn\",\"priority\":\"koord-batch\",\"resources\":{\"kubernetes.io/batch-cpu\":\"1000\",\"kubernetes.io/batch-memory\":\"1024\"}}]}", 115 | }, 116 | }, 117 | want: corev1.ResourceList{ 118 | BatchCPU: resource.MustParse("1000"), 119 | BatchMemory: resource.MustParse("1024"), 120 | }, 121 | wantErr: false, 122 | }, 123 | } 124 | for _, tt := range tests { 125 | t.Run(tt.name, func(t *testing.T) { 126 | got, err := GetYARNAllocatedResource(tt.args.annotations) 127 | if (err != nil) != tt.wantErr { 128 | t.Errorf("GetNodeAllocated() error = %v, wantErr %v", err, tt.wantErr) 129 | return 130 | } 131 | if !reflect.DeepEqual(got, tt.want) { 132 | t.Errorf("GetYARNAllocatedResource() got = %v, want %v", got, tt.want) 133 | } 134 | }) 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /README-zh_CN.md: -------------------------------------------------------------------------------- 1 |

2 |

Koordinator YARN Copilot

3 | Koordinator 4 |

5 | 6 | [![License](https://img.shields.io/github/license/koordinator-sh/koordinator.svg?color=4EB1BA&style=flat-square)](https://opensource.org/licenses/Apache-2.0) 7 | [![GitHub release](https://img.shields.io/github/v/release/koordinator-sh/yarn-copilot.svg?style=flat-square)](https://github.com/koordinator-sh/yarn-copilot/releases/latest) 8 | [![CI](https://img.shields.io/github/actions/workflow/status/koordinator-sh/yarn-copilot/ci.yaml?label=CI&logo=github&style=flat-square&branch=main)](https://github.com/koordinator-sh/yarn-copilot/actions/workflows/ci.yaml) 9 | [![Go Report Card](https://goreportcard.com/badge/github.com/koordinator-sh/yarn-copilot?style=flat-square)](https://goreportcard.com/report/github.com/koordinator-sh/yarn-copilot) 10 | [![codecov](https://img.shields.io/codecov/c/github/koordinator-sh/yarn-copilot?logo=codecov&style=flat-square)](https://codecov.io/github/koordinator-sh/yarn-copilot) 11 | [![PRs Welcome](https://badgen.net/badge/PRs/welcome/green?icon=https://api.iconify.design/octicon:git-pull-request.svg?color=white&style=flat-square)](CONTRIBUTING.md) 12 | [![Slack](https://badgen.net/badge/slack/join/4A154B?icon=slack&style=flat-square)](https://join.slack.com/t/koordinator-sh/shared_invite/zt-1756qoub4-Cn4~esfdlfAPsD7cwO2NzA) 13 | 14 | 15 | [English](./README.md) | 简体中文 16 | 17 | ## 介绍 18 | 19 | Koordinator已经支持了K8s生态内的在离线混部,通过Batch超卖资源以及BE QoS,离线任务可以使用到集群内的空闲资源,提升资源使用效率。然而, 20 | 在K8s生态外,仍有相当数量的应用运行在其他资源管理系统,例如Apache Hadoop YARN。作为大数据生态下的资源管理系统,YARN承载了包括MapReduce、 21 | Spark、Flink以及Presto等在内的多种计算引擎。 22 | 23 | 为了进一步丰富Koordinator支持的在离线混部场景,Koordinator社区提供了面向大数据场景的YARN混部套件`Koordinator YARN Copilot`, 24 | 用于支持Hadoop YARN应用与K8s混部,将Koordinator的Batch资源提供给Hadoop YARN使用,进一步提升集群资源的使用效率。 25 | `Koordinator YARN Copilot`具备以下特点: 26 | 27 | - 面向开源生态:针对开源版本的Hadoop YARN实现,无需对YARN本身做侵入式改造。 28 | - 统一资源优先级和QoS策略:YARN混部套件完全对标Koordinator的Batch资源模型,同时接受单机一系列QoS策略的管控。 29 | - 节点资源共享:在同一节点上可以同时运行Batch类型的Pod和YARN的Task。 30 | - 
适应多种环境:YARN混部套件对集群类型没有约束,可以在包括公共云、IDC等多种场景下使用。 31 | 32 | ## 快速开始 33 | 34 | 你可以在 [Koordinator website](https://koordinator.sh/docs) 查看到完整的文档集。 35 | 36 | - 安装/升级 Koordinator [最新版本](https://koordinator.sh/docs/installation) 37 | - 参考[最佳实践](https://koordinator.sh/zh-Hans/docs/next/best-practices/colocation-of-hadoop-yarn/),里面包含了关于K8s与YARN混部的详细示例。 38 | 39 | ## 行为守则 40 | 41 | Koordinator 社区遵照[行为守则](https://github.com/koordinator-sh/koordinator/blob/main/CODE_OF_CONDUCT.md)。我们鼓励每个人在参与之前先读一下它。 42 | 43 | 为了营造一个开放和热情的环境,我们作为贡献者和维护者承诺:无论年龄、体型、残疾、族裔、经验水平、教育程度、社会经济地位、国籍、个人外貌、种族、宗教或性认同和性取向如何,参与我们的项目和社区的每个人都不会受到骚扰。 44 | 45 | ## 贡献 46 | 47 | 我们非常欢迎每一位社区同学共同参与 Koordinator 的建设,你可以从 [CONTRIBUTING.md](https://github.com/koordinator-sh/koordinator/blob/main/CONTRIBUTING.md) 手册开始。 48 | 49 | ## 成员 50 | 51 | 我们鼓励所有贡献者成为成员。我们的目标是发展一个由贡献者、审阅者和代码所有者组成的活跃、健康的社区。在我们的[社区成员](https://github.com/koordinator-sh/community/blob/main/community-membership.md)页面,详细了解我们的成员要求和责任。 52 | 53 | ## 社区 54 | 55 | 在 [koordinator-sh/community 仓库](https://github.com/koordinator-sh/community) 中托管了所有社区信息, 例如成员制度、代码规范等。 56 | 57 | 我们鼓励所有贡献者成为成员。我们的目标是发展一个由贡献者、审阅者和代码所有者组成的活跃、健康的社区。 58 | 请在[社区成员制度](https://github.com/koordinator-sh/community/blob/main/community-membership.md)页面,详细了解我们的成员要求和责任。 59 | 60 | 活跃的社区途径: 61 | 62 | - 社区双周会(中文): 63 | - 周二 19:30 GMT+8 (北京时间) 64 | - [钉钉会议链接](https://meeting.dingtalk.com/j/cgTTojEI8Zy) 65 | - [议题&记录文档](https://shimo.im/docs/m4kMLdgO1LIma9qD) 66 | - Slack( English ): [koordinator channel](https://kubernetes.slack.com/channels/koordinator) in Kubernetes workspace 67 | - 钉钉( Chinese ): 搜索群ID `33383887`或者扫描二维码加入 68 | 69 |
70 | Dingtalk QRCode 71 |
72 | 73 | ## License 74 | 75 | Koordinator is licensed under the Apache License, Version 2.0. See [LICENSE](./LICENSE) for the full license text. 76 | 82 | 83 | ## 安全 84 | 对于发现的安全漏洞,请邮件发送至kubernetes-security@service.aliyun.com,您可在[SECURITY.md](./SECURITY.md)文件中找到更多信息。 85 | -------------------------------------------------------------------------------- /pkg/yarn/config/configuration.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2013 The Cloudera Inc. 3 | Copyright 2023 The Koordinator Authors. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | */ 17 | 18 | package conf 19 | 20 | import ( 21 | "encoding/xml" 22 | "io" 23 | "os" 24 | "strconv" 25 | 26 | "k8s.io/klog/v2" 27 | ) 28 | 29 | var ( 30 | CORE_DEFAULT Resource = Resource{"core-default.xml", false} 31 | CORE_SITE Resource = Resource{"core-site.xml", false} 32 | HDFS_DEFAULT Resource = Resource{"hdfs-default.xml", false} 33 | HDFS_SITE Resource = Resource{"hdfs-site.xml", false} 34 | ) 35 | 36 | type Resource struct { 37 | Name string 38 | Required bool 39 | } 40 | 41 | type Configuration interface { 42 | Get(key string, defaultValue string) (string, error) 43 | GetInt(key string, defaultValue int) (int, error) 44 | GetBool(key string, defaultValue bool) (bool, error) 45 | 46 | Set(key string, value string) error 47 | SetInt(key string, value int) error 48 | } 49 | 50 | type configuration struct { 51 | Properties map[string]string 52 | } 53 | 54 | type property struct { 55 | Name string `xml:"name"` 56 | Value string `xml:"value"` 57 | } 58 | 59 | type hadoopConfiguration struct { 60 | XMLName xml.Name `xml:"configuration"` 61 | Properties []property `xml:"property"` 62 | } 63 | 64 | func (conf *configuration) Get(key string, defaultValue string) (string, error) { 65 | value, exists := conf.Properties[key] 66 | if !exists { 67 | return defaultValue, nil 68 | } 69 | return value, nil 70 | } 71 | 72 | func (conf *configuration) GetInt(key string, defaultValue int) (int, error) { 73 | value, exists := conf.Properties[key] 74 | if !exists { 75 | return defaultValue, nil 76 | } 77 | return strconv.Atoi(value) 78 | } 79 | 80 | func (conf *configuration) GetBool(key string, defaultValue bool) (bool, error) { 81 | value, exists := conf.Properties[key] 82 | if !exists { 83 | return defaultValue, nil 84 | } 85 | return strconv.ParseBool(value) 86 | } 87 | 88 | func (conf *configuration) Set(key string, value string) error { 89 | conf.Properties[key] = value 90 | return nil 91 | } 92 | 93 | func (conf *configuration) SetInt(key string, value int) error { 
94 | conf.Properties[key] = strconv.Itoa(value) 95 | return nil 96 | } 97 | 98 | func NewConfiguration(hadoopConfDir string) (Configuration, error) { 99 | return NewConfigurationResources(hadoopConfDir, []Resource{}, "") 100 | } 101 | // NewConfigurationResources loads core-default.xml, core-site.xml and the given resources (file names prefixed with prefix) from hadoopConfDir; later files overwrite earlier keys, and files that cannot be opened are skipped unless Required. 102 | func NewConfigurationResources(hadoopConfDir string, resources []Resource, prefix string) (Configuration, error) { 103 | // Add $HADOOP_CONF_DIR/core-default.xml & $HADOOP_CONF_DIR/core-site.xml 104 | resourcesWithDefault := []Resource{CORE_DEFAULT, CORE_SITE} 105 | resourcesWithDefault = append(resourcesWithDefault, resources...) 106 | 107 | c := configuration{Properties: make(map[string]string)} 108 | 109 | for _, resource := range resourcesWithDefault { 110 | conf, err := os.Open(hadoopConfDir + string(os.PathSeparator) + prefix + resource.Name) 111 | if err != nil { 112 | if !resource.Required { 113 | continue 114 | } 115 | klog.Warningf("Couldn't open resource %s: %v", resource.Name, err) 116 | return nil, err 117 | } 118 | confData, err := io.ReadAll(conf) 119 | conf.Close() // close per iteration: a defer in this loop would hold every file open until return and was skipped on read errors 120 | if err != nil { 121 | klog.Warningf("Couldn't read resource %s: %v", resource.Name, err) 122 | return nil, err 123 | } 124 | 125 | // Parse 126 | var hConf hadoopConfiguration 127 | err = xml.Unmarshal(confData, &hConf) 128 | if err != nil { 129 | klog.Warningf("Couldn't parse resource %s: %v", resource.Name, err) 130 | return nil, err 131 | } 132 | 133 | // Save into configuration 134 | for _, kv := range hConf.Properties { 135 | err = c.Set(kv.Name, kv.Value) 136 | if err != nil { 137 | return nil, err 138 | } 139 | } 140 | } 141 | 142 | return &c, nil 143 | } 144 | -------------------------------------------------------------------------------- /pkg/yarn/apis/proto/hadoopcommon/HAServiceProtocol.proto: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * These .proto interfaces are private and stable. 21 | * Please see http://wiki.apache.org/hadoop/Compatibility 22 | * for what changes are allowed for a *stable* .proto interface. 23 | */ 24 | 25 | syntax = "proto2"; 26 | option java_package = "org.apache.hadoop.ha.proto"; 27 | option java_outer_classname = "HAServiceProtocolProtos"; 28 | option java_generic_services = true; 29 | option java_generate_equals_and_hash = true; 30 | package hadoop.common; 31 | 32 | enum HAServiceStateProto { 33 | INITIALIZING = 0; 34 | ACTIVE = 1; 35 | STANDBY = 2; 36 | OBSERVER = 3; 37 | } 38 | 39 | enum HARequestSource { 40 | REQUEST_BY_USER = 0; 41 | REQUEST_BY_USER_FORCED = 1; 42 | REQUEST_BY_ZKFC = 2; 43 | } 44 | 45 | message HAStateChangeRequestInfoProto { 46 | required HARequestSource reqSource = 1; 47 | } 48 | 49 | /** 50 | * void request 51 | */ 52 | message MonitorHealthRequestProto { 53 | } 54 | 55 | /** 56 | * void response 57 | */ 58 | message MonitorHealthResponseProto { 59 | } 60 | 61 | /** 62 | * void request 63 | */ 64 | message TransitionToActiveRequestProto { 65 | required HAStateChangeRequestInfoProto reqInfo = 1; 66 | } 67 | 68 | /** 69 | * void response 70 | */ 71 | message TransitionToActiveResponseProto { 72 | } 73 
| 74 | /** 75 | * void request 76 | */ 77 | message TransitionToStandbyRequestProto { 78 | required HAStateChangeRequestInfoProto reqInfo = 1; 79 | } 80 | 81 | /** 82 | * void response 83 | */ 84 | message TransitionToStandbyResponseProto { 85 | } 86 | 87 | /** 88 | * void request 89 | */ 90 | message TransitionToObserverRequestProto { 91 | required HAStateChangeRequestInfoProto reqInfo = 1; 92 | } 93 | 94 | /** 95 | * void response 96 | */ 97 | message TransitionToObserverResponseProto { 98 | } 99 | 100 | /** 101 | * void request 102 | */ 103 | message GetServiceStatusRequestProto { 104 | } 105 | 106 | /** 107 | * Returns the state of the service 108 | */ 109 | message GetServiceStatusResponseProto { 110 | required HAServiceStateProto state = 1; 111 | 112 | // If state is STANDBY, indicate whether it is 113 | // ready to become active. 114 | optional bool readyToBecomeActive = 2; 115 | // If not ready to become active, a textual explanation of why not 116 | optional string notReadyReason = 3; 117 | } 118 | 119 | /** 120 | * Protocol interface provides High availability related 121 | * primitives to monitor and failover a service. 122 | * 123 | * For details see o.a.h.ha.HAServiceProtocol. 124 | */ 125 | service HAServiceProtocolService { 126 | /** 127 | * Monitor the health of a service. 128 | */ 129 | rpc monitorHealth(MonitorHealthRequestProto) 130 | returns(MonitorHealthResponseProto); 131 | 132 | /** 133 | * Request service to tranisition to active state. 134 | */ 135 | rpc transitionToActive(TransitionToActiveRequestProto) 136 | returns(TransitionToActiveResponseProto); 137 | 138 | /** 139 | * Request service to transition to standby state. 140 | */ 141 | rpc transitionToStandby(TransitionToStandbyRequestProto) 142 | returns(TransitionToStandbyResponseProto); 143 | 144 | /** 145 | * Request service to transition to observer state. 
146 | */ 147 | rpc transitionToObserver(TransitionToObserverRequestProto) 148 | returns(TransitionToObserverResponseProto); 149 | 150 | /** 151 | * Get the current status of the service. 152 | */ 153 | rpc getServiceStatus(GetServiceStatusRequestProto) 154 | returns(GetServiceStatusResponseProto); 155 | } 156 | -------------------------------------------------------------------------------- /pkg/yarn/apis/proto/hadoopyarn/server/yarn_server_resourcemanager_service_protos.proto: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * These .proto interfaces are private and stable. 21 | * Please see http://wiki.apache.org/hadoop/Compatibility 22 | * for what changes are allowed for a *stable* .proto interface. 
23 | */ 24 | 25 | syntax = "proto2"; 26 | option java_package = "org.apache.hadoop.yarn.proto"; 27 | option java_outer_classname = "YarnServerResourceManagerServiceProtos"; 28 | option java_generic_services = true; 29 | option java_generate_equals_and_hash = true; 30 | package hadoop.yarn; 31 | 32 | import "yarn_protos.proto"; 33 | 34 | message RefreshQueuesRequestProto { 35 | } 36 | message RefreshQueuesResponseProto { 37 | } 38 | 39 | message RefreshNodesRequestProto { 40 | optional DecommissionTypeProto decommissionType = 1 [default = NORMAL]; 41 | optional int32 decommissionTimeout = 2; 42 | } 43 | message RefreshNodesResponseProto { 44 | } 45 | 46 | message RefreshSuperUserGroupsConfigurationRequestProto { 47 | } 48 | message RefreshSuperUserGroupsConfigurationResponseProto { 49 | } 50 | 51 | message RefreshUserToGroupsMappingsRequestProto { 52 | } 53 | message RefreshUserToGroupsMappingsResponseProto { 54 | } 55 | 56 | message RefreshAdminAclsRequestProto { 57 | } 58 | message RefreshAdminAclsResponseProto { 59 | } 60 | 61 | message RefreshServiceAclsRequestProto { 62 | } 63 | message RefreshServiceAclsResponseProto { 64 | } 65 | 66 | message GetGroupsForUserRequestProto { 67 | required string user = 1; 68 | } 69 | 70 | message GetGroupsForUserResponseProto { 71 | repeated string groups = 1; 72 | } 73 | 74 | message UpdateNodeResourceRequestProto { 75 | repeated NodeResourceMapProto node_resource_map = 1; 76 | } 77 | 78 | message UpdateNodeResourceResponseProto { 79 | } 80 | 81 | message RefreshNodesResourcesRequestProto { 82 | } 83 | 84 | message RefreshNodesResourcesResponseProto { 85 | } 86 | 87 | message AddToClusterNodeLabelsRequestProto { 88 | repeated string deprecatedNodeLabels = 1; 89 | repeated NodeLabelProto nodeLabels = 2; 90 | } 91 | 92 | message AddToClusterNodeLabelsResponseProto { 93 | } 94 | 95 | message RemoveFromClusterNodeLabelsRequestProto { 96 | repeated string nodeLabels = 1; 97 | } 98 | 99 | message 
RemoveFromClusterNodeLabelsResponseProto { 100 | } 101 | 102 | message ReplaceLabelsOnNodeRequestProto { 103 | repeated NodeIdToLabelsProto nodeToLabels = 1; 104 | optional bool failOnUnknownNodes = 2; 105 | } 106 | 107 | message ReplaceLabelsOnNodeResponseProto { 108 | } 109 | 110 | message UpdateNodeLabelsResponseProto { 111 | } 112 | 113 | message CheckForDecommissioningNodesRequestProto { 114 | } 115 | message CheckForDecommissioningNodesResponseProto { 116 | repeated NodeIdProto decommissioningNodes = 1; 117 | } 118 | 119 | message RefreshClusterMaxPriorityRequestProto { 120 | } 121 | message RefreshClusterMaxPriorityResponseProto { 122 | } 123 | 124 | message NodeIdToLabelsNameProto { 125 | optional NodeIdProto nodeId = 1; 126 | repeated string nodeLabels = 2; 127 | } 128 | 129 | enum DecommissionTypeProto { 130 | NORMAL = 1; 131 | GRACEFUL = 2; 132 | FORCEFUL = 3; 133 | } 134 | 135 | 136 | enum AttributeMappingOperationTypeProto { 137 | REPLACE = 1; 138 | ADD = 2; 139 | REMOVE = 3; 140 | } 141 | 142 | message NodesToAttributesMappingRequestProto { 143 | optional AttributeMappingOperationTypeProto operation = 1 [default = REPLACE]; 144 | repeated NodeToAttributesProto nodeToAttributes = 2; 145 | optional bool failOnUnknownNodes = 3; 146 | } 147 | 148 | message NodesToAttributesMappingResponseProto { 149 | } 150 | ////////////////////////////////////////////////////////////////// 151 | ///////////// RM Failover related records //////////////////////// 152 | ////////////////////////////////////////////////////////////////// 153 | message ActiveRMInfoProto { 154 | optional string clusterId = 1; 155 | optional string rmId = 2; 156 | } 157 | -------------------------------------------------------------------------------- /pkg/yarn/client/mockclient/mock_client.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | // 17 | 18 | // Code generated by MockGen. DO NOT EDIT. 19 | // Source: pkg/yarn/client/client.go 20 | 21 | // Package mock_client is a generated GoMock package. 22 | package mock_client 23 | 24 | import ( 25 | reflect "reflect" 26 | 27 | gomock "github.com/golang/mock/gomock" 28 | hadoopyarn "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopyarn" 29 | server "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopyarn/server" 30 | ) 31 | 32 | // MockYarnClient is a mock of YarnClient interface. 33 | type MockYarnClient struct { 34 | ctrl *gomock.Controller 35 | recorder *MockYarnClientMockRecorder 36 | } 37 | 38 | // MockYarnClientMockRecorder is the mock recorder for MockYarnClient. 39 | type MockYarnClientMockRecorder struct { 40 | mock *MockYarnClient 41 | } 42 | 43 | // NewMockYarnClient creates a new mock instance. 44 | func NewMockYarnClient(ctrl *gomock.Controller) *MockYarnClient { 45 | mock := &MockYarnClient{ctrl: ctrl} 46 | mock.recorder = &MockYarnClientMockRecorder{mock} 47 | return mock 48 | } 49 | 50 | // EXPECT returns an object that allows the caller to indicate expected use. 51 | func (m *MockYarnClient) EXPECT() *MockYarnClientMockRecorder { 52 | return m.recorder 53 | } 54 | 55 | // Close mocks base method. 
56 | func (m *MockYarnClient) Close() { 57 | m.ctrl.T.Helper() 58 | m.ctrl.Call(m, "Close") 59 | } 60 | 61 | // Close indicates an expected call of Close. 62 | func (mr *MockYarnClientMockRecorder) Close() *gomock.Call { 63 | mr.mock.ctrl.T.Helper() 64 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Close", reflect.TypeOf((*MockYarnClient)(nil).Close)) 65 | } 66 | 67 | // GetClusterNodes mocks base method. 68 | func (m *MockYarnClient) GetClusterNodes(request *hadoopyarn.GetClusterNodesRequestProto) (*hadoopyarn.GetClusterNodesResponseProto, error) { 69 | m.ctrl.T.Helper() 70 | ret := m.ctrl.Call(m, "GetClusterNodes", request) 71 | ret0, _ := ret[0].(*hadoopyarn.GetClusterNodesResponseProto) 72 | ret1, _ := ret[1].(error) 73 | return ret0, ret1 74 | } 75 | 76 | // GetClusterNodes indicates an expected call of GetClusterNodes. 77 | func (mr *MockYarnClientMockRecorder) GetClusterNodes(request interface{}) *gomock.Call { 78 | mr.mock.ctrl.T.Helper() 79 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetClusterNodes", reflect.TypeOf((*MockYarnClient)(nil).GetClusterNodes), request) 80 | } 81 | 82 | // Initialize mocks base method. 83 | func (m *MockYarnClient) Initialize() error { 84 | m.ctrl.T.Helper() 85 | ret := m.ctrl.Call(m, "Initialize") 86 | ret0, _ := ret[0].(error) 87 | return ret0 88 | } 89 | 90 | // Initialize indicates an expected call of Initialize. 91 | func (mr *MockYarnClientMockRecorder) Initialize() *gomock.Call { 92 | mr.mock.ctrl.T.Helper() 93 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Initialize", reflect.TypeOf((*MockYarnClient)(nil).Initialize)) 94 | } 95 | 96 | // Reinitialize mocks base method. 97 | func (m *MockYarnClient) Reinitialize() error { 98 | m.ctrl.T.Helper() 99 | ret := m.ctrl.Call(m, "Reinitialize") 100 | ret0, _ := ret[0].(error) 101 | return ret0 102 | } 103 | 104 | // Reinitialize indicates an expected call of Reinitialize. 
105 | func (mr *MockYarnClientMockRecorder) Reinitialize() *gomock.Call { 106 | mr.mock.ctrl.T.Helper() 107 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Reinitialize", reflect.TypeOf((*MockYarnClient)(nil).Reinitialize)) 108 | } 109 | 110 | // UpdateNodeResource mocks base method. 111 | func (m *MockYarnClient) UpdateNodeResource(request *server.UpdateNodeResourceRequestProto) (*server.UpdateNodeResourceResponseProto, error) { 112 | m.ctrl.T.Helper() 113 | ret := m.ctrl.Call(m, "UpdateNodeResource", request) 114 | ret0, _ := ret[0].(*server.UpdateNodeResourceResponseProto) 115 | ret1, _ := ret[1].(error) 116 | return ret0, ret1 117 | } 118 | 119 | // UpdateNodeResource indicates an expected call of UpdateNodeResource. 120 | func (mr *MockYarnClientMockRecorder) UpdateNodeResource(request interface{}) *gomock.Call { 121 | mr.mock.ctrl.T.Helper() 122 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateNodeResource", reflect.TypeOf((*MockYarnClient)(nil).UpdateNodeResource), request) 123 | } 124 | -------------------------------------------------------------------------------- /pkg/yarn/cache/nodes_syncer.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package cache 18 | 19 | import ( 20 | "context" 21 | "fmt" 22 | "sync" 23 | "sync/atomic" 24 | "time" 25 | 26 | "k8s.io/klog/v2" 27 | 28 | "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopyarn" 29 | yarnclient "github.com/koordinator-sh/yarn-copilot/pkg/yarn/client" 30 | ) 31 | 32 | const ( 33 | syncInterval = time.Second 34 | ) 35 | 36 | // YARN RM only supports get all nodes from cluster, sync to cache for efficiency 37 | type NodesSyncer struct { 38 | yarnClients map[string]yarnclient.YarnClient 39 | started atomic.Bool 40 | 41 | // > 42 | cache map[string]map[string]*hadoopyarn.NodeReportProto 43 | mtx sync.RWMutex 44 | } 45 | 46 | func NewNodesSyncer(yarnClients map[string]yarnclient.YarnClient) *NodesSyncer { 47 | return &NodesSyncer{ 48 | yarnClients: yarnClients, 49 | cache: map[string]map[string]*hadoopyarn.NodeReportProto{}, 50 | mtx: sync.RWMutex{}, 51 | } 52 | } 53 | 54 | func (r *NodesSyncer) GetNodeResource(yarnNode *YarnNode) (*hadoopyarn.NodeReportProto, bool) { 55 | if yarnNode == nil { 56 | return nil, false 57 | } 58 | key := r.getKey(yarnNode.Name, yarnNode.Port) 59 | r.mtx.RLock() 60 | defer r.mtx.RUnlock() 61 | clusterCache, exist := r.cache[yarnNode.ClusterID] 62 | if !exist { 63 | return nil, false 64 | } 65 | data, exist := clusterCache[key] 66 | return data, exist 67 | } 68 | 69 | func (r *NodesSyncer) getKey(yarnNodeName string, yarnNodePort int32) string { 70 | return fmt.Sprintf("%s-%d", yarnNodeName, yarnNodePort) 71 | } 72 | 73 | func (r *NodesSyncer) Start(ctx context.Context) error { 74 | t := time.NewTicker(syncInterval) 75 | debug := time.NewTicker(syncInterval * 10) 76 | go func() { 77 | for { 78 | select { 79 | case <-t.C: 80 | if err := r.syncYARNNodeAllocatedResource(); err != nil { 81 | klog.Errorf("sync yarn node allocated resource failed, error: %v", err) 82 | } else { 83 | r.started.Store(true) 84 | } 85 | case <-debug.C: 86 | r.debug() 87 | case <-ctx.Done(): 88 | klog.V(1).Infof("stop 
node syncer") 89 | return 90 | } 91 | } 92 | }() 93 | return nil 94 | } 95 | 96 | func (r *NodesSyncer) Started() bool { 97 | return r.started.Load() 98 | } 99 | 100 | func (r *NodesSyncer) debug() { 101 | r.mtx.RLock() 102 | defer r.mtx.RUnlock() 103 | for clusterID, clusterCache := range r.cache { 104 | for key, value := range clusterCache { 105 | klog.V(3).Infof("debug cache: %s %s %d %d %d %d", clusterID, key, *value.Used.VirtualCores, 106 | *value.Used.Memory, *value.Capability.VirtualCores, *value.Capability.Memory) 107 | } 108 | } 109 | } 110 | 111 | // GetYarnNodeInfo get yarn node info from cache, read only result 112 | // Warning: Do not edit any field of results 113 | func (r *NodesSyncer) GetYarnNodeInfo() map[string][]*hadoopyarn.NodeReportProto { 114 | r.mtx.RLock() 115 | defer r.mtx.RUnlock() 116 | res := map[string][]*hadoopyarn.NodeReportProto{} 117 | for clusterID, clusterCache := range r.cache { 118 | var data []*hadoopyarn.NodeReportProto 119 | for _, proto := range clusterCache { 120 | data = append(data, proto) 121 | } 122 | res[clusterID] = data 123 | } 124 | return res 125 | } 126 | 127 | func (r *NodesSyncer) syncYARNNodeAllocatedResource() error { 128 | req := hadoopyarn.GetClusterNodesRequestProto{NodeStates: []hadoopyarn.NodeStateProto{hadoopyarn.NodeStateProto_NS_RUNNING}} 129 | res := map[string]map[string]*hadoopyarn.NodeReportProto{} 130 | for id, yarnClient := range r.yarnClients { 131 | nodes, err := yarnClient.GetClusterNodes(&req) 132 | if err != nil { 133 | initErr := yarnClient.Reinitialize() 134 | return fmt.Errorf("GetClusterNodes error %v, reinitialize error %v", err, initErr) 135 | } 136 | if nodes == nil { 137 | continue 138 | } 139 | clusterCache := map[string]*hadoopyarn.NodeReportProto{} 140 | for _, reportProto := range nodes.GetNodeReports() { 141 | if reportProto.NodeId.Host == nil || reportProto.NodeId.Port == nil { 142 | klog.Warningf("got nil node from rm %v", id) 143 | continue 144 | } 145 | key := 
r.getKey(*reportProto.NodeId.Host, *reportProto.NodeId.Port) 146 | clusterCache[key] = reportProto 147 | } 148 | res[id] = clusterCache 149 | } 150 | r.mtx.Lock() 151 | defer r.mtx.Unlock() 152 | r.cache = res 153 | return nil 154 | } 155 | -------------------------------------------------------------------------------- /pkg/copilot-agent/runtime/runtime.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package runtime 18 | 19 | import ( 20 | "net" 21 | "net/http" 22 | "time" 23 | 24 | "github.com/go-resty/resty/v2" 25 | v1 "k8s.io/api/core/v1" 26 | "k8s.io/klog/v2" 27 | ) 28 | 29 | func GetAliveCopilots() map[string]CustomRuntimePlugin { 30 | if defaultManger == nil { 31 | return map[string]CustomRuntimePlugin{} 32 | } 33 | return defaultManger.GetAliveCopilots() 34 | } 35 | 36 | func GetCopilot(name string) (CustomRuntimePlugin, bool) { 37 | if defaultManger == nil { 38 | return nil, false 39 | } 40 | return defaultManger.GetCopilot(name) 41 | } 42 | 43 | type ContainerInfo struct { 44 | Name string `json:"name"` 45 | Namespace string `json:"namespace"` 46 | UID string `json:"uid"` 47 | Labels map[string]string `json:"labels"` 48 | Annotations map[string]string `json:"annotations"` 49 | CreateTimestamp time.Time `json:"createTimestamp"` 50 | 51 | CgroupDir string `json:"cgroupDir"` 52 | HostNetwork bool `json:"hostNetwork"` 53 | Resources v1.ResourceRequirements `json:"resources"` 54 | } 55 | 56 | type PluginInfo struct { 57 | Name string `json:"name"` 58 | Version string `json:"version"` 59 | } 60 | 61 | type KillRequest struct { 62 | ContainerID string `json:"containerID,omitempty"` 63 | Resources v1.ResourceList `json:"resources,omitempty"` 64 | } 65 | 66 | type KillInfo struct { 67 | Items []*ContainerInfo `json:"items,omitempty"` 68 | } 69 | 70 | type CustomRuntimePlugin interface { 71 | IsAlive() bool 72 | Info() (*PluginInfo, error) 73 | ListContainer() ([]*ContainerInfo, error) 74 | GetContainer(ContainerID string) (*ContainerInfo, error) 75 | KillContainer(killReq *KillRequest) (*KillInfo, error) 76 | KillContainersByResource(killReq *KillRequest) (*KillInfo, error) 77 | } 78 | 79 | type httpPlugin struct { 80 | client *resty.Client 81 | } 82 | 83 | func newHttpPlugin(socket string) *httpPlugin { 84 | 85 | transport := http.Transport{ 86 | Dial: func(_, _ string) (net.Conn, error) { 87 | return net.Dial("unix", socket) 88 | }, 89 | } 
90 | 91 | // Create a Resty Client 92 | client := resty.New() 93 | 94 | // Set the previous transport that we created, set the scheme of the communication to the 95 | // socket and set the unixSocket as the HostURL. 96 | client.SetTransport(&transport).SetScheme("http").SetBaseURL("unixSocket") 97 | return &httpPlugin{ 98 | client: client, 99 | } 100 | } 101 | 102 | func (h *httpPlugin) IsAlive() bool { 103 | resp, err := h.client.R().Get("/health") 104 | if err != nil { 105 | klog.V(4).Info(err.Error()) 106 | return false 107 | } 108 | if resp.IsError() { 109 | klog.V(4).Info(resp.Error()) 110 | return false 111 | } 112 | klog.V(5).Infof("health from plugin, %+v", string(resp.Body())) 113 | return resp.IsSuccess() 114 | } 115 | 116 | func (h *httpPlugin) Info() (*PluginInfo, error) { 117 | var res *PluginInfo 118 | _, err := h.client.R(). 119 | SetResult(&res). 120 | Get("/information") 121 | if err != nil { 122 | return nil, err 123 | } 124 | return res, nil 125 | } 126 | 127 | func (h *httpPlugin) ListContainer() ([]*ContainerInfo, error) { 128 | var res []*ContainerInfo 129 | _, err := h.client.R().SetResult(&res).Get("/v1/containers") 130 | klog.V(5).Infof("list container from plugin, %+v", res) 131 | if err != nil { 132 | return nil, err 133 | } 134 | return res, nil 135 | } 136 | 137 | func (h *httpPlugin) GetContainer(ContainerID string) (*ContainerInfo, error) { 138 | var res *ContainerInfo 139 | _, err := h.client.R(). 140 | SetResult(&res). 141 | SetQueryParam("containerID", ContainerID). 142 | Get("/v1/container") 143 | klog.V(5).Infof("get container from plugin, %+v", res) 144 | if err != nil { 145 | return nil, err 146 | } 147 | return res, nil 148 | } 149 | 150 | func (h *httpPlugin) KillContainer(killReq *KillRequest) (*KillInfo, error) { 151 | var res *KillInfo 152 | _, err := h.client.R(). 153 | SetResult(&res). 154 | SetBody(killReq). 
155 | Post("/v1/killContainer") 156 | klog.V(5).Infof("kill container from plugin, %+v", res) 157 | if err != nil { 158 | return nil, err 159 | } 160 | return res, nil 161 | } 162 | 163 | func (h *httpPlugin) KillContainersByResource(killReq *KillRequest) (*KillInfo, error) { 164 | var res *KillInfo 165 | _, err := h.client.R(). 166 | SetResult(&res). 167 | SetBody(killReq). 168 | Post("/v1/killContainersByResource") 169 | if err != nil { 170 | return nil, err 171 | } 172 | return res, nil 173 | } 174 | -------------------------------------------------------------------------------- /pkg/yarn/apis/proto/hadoopyarn/applicationclient_protocol.proto: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * These .proto interfaces are public and stable. 21 | * Please see https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Compatibility.html 22 | * for what changes are allowed for a *stable* .proto interface. 
syntax = "proto2";
// Java code-generation options; they do not affect the wire format.
option java_package = "org.apache.hadoop.yarn.proto";
option java_outer_classname = "ApplicationClientProtocol";
option java_generic_services = true;
option java_generate_equals_and_hash = true;
package hadoop.yarn;

// The request/response messages used below are not declared in this file.
// NOTE(review): the hadoop.common.* delegation-token messages presumably come
// from Security.proto and the rest from yarn_service_protos.proto - confirm.
import "Security.proto";
import "yarn_service_protos.proto";

// ApplicationClientProtocolService declares unary RPCs only: every method takes
// one request message and returns one response message.
service ApplicationClientProtocolService {
  rpc getNewApplication (GetNewApplicationRequestProto) returns (GetNewApplicationResponseProto);
  rpc getApplicationReport (GetApplicationReportRequestProto) returns (GetApplicationReportResponseProto);
  rpc submitApplication (SubmitApplicationRequestProto) returns (SubmitApplicationResponseProto);
  rpc failApplicationAttempt (FailApplicationAttemptRequestProto) returns (FailApplicationAttemptResponseProto);
  rpc forceKillApplication (KillApplicationRequestProto) returns (KillApplicationResponseProto);
  rpc getClusterMetrics (GetClusterMetricsRequestProto) returns (GetClusterMetricsResponseProto);
  rpc getApplications (GetApplicationsRequestProto) returns (GetApplicationsResponseProto);
  rpc getClusterNodes (GetClusterNodesRequestProto) returns (GetClusterNodesResponseProto);
  rpc getQueueInfo (GetQueueInfoRequestProto) returns (GetQueueInfoResponseProto);
  rpc getQueueUserAcls (GetQueueUserAclsInfoRequestProto) returns (GetQueueUserAclsInfoResponseProto);
  // Delegation-token RPCs use message types from package hadoop.common.
  rpc getDelegationToken(hadoop.common.GetDelegationTokenRequestProto) returns (hadoop.common.GetDelegationTokenResponseProto);
  rpc renewDelegationToken(hadoop.common.RenewDelegationTokenRequestProto) returns (hadoop.common.RenewDelegationTokenResponseProto);
  rpc cancelDelegationToken(hadoop.common.CancelDelegationTokenRequestProto) returns (hadoop.common.CancelDelegationTokenResponseProto);
  rpc moveApplicationAcrossQueues(MoveApplicationAcrossQueuesRequestProto) returns (MoveApplicationAcrossQueuesResponseProto);
  rpc getApplicationAttemptReport (GetApplicationAttemptReportRequestProto) returns (GetApplicationAttemptReportResponseProto);
  rpc getApplicationAttempts (GetApplicationAttemptsRequestProto) returns (GetApplicationAttemptsResponseProto);
  rpc getContainerReport (GetContainerReportRequestProto) returns (GetContainerReportResponseProto);
  rpc getContainers (GetContainersRequestProto) returns (GetContainersResponseProto);
  rpc getNewReservation (GetNewReservationRequestProto) returns (GetNewReservationResponseProto);
  rpc submitReservation (ReservationSubmissionRequestProto) returns (ReservationSubmissionResponseProto);
  rpc updateReservation (ReservationUpdateRequestProto) returns (ReservationUpdateResponseProto);
  rpc deleteReservation (ReservationDeleteRequestProto) returns (ReservationDeleteResponseProto);
  rpc listReservations (ReservationListRequestProto) returns (ReservationListResponseProto);
  rpc getNodeToLabels (GetNodesToLabelsRequestProto) returns (GetNodesToLabelsResponseProto);
  rpc getLabelsToNodes (GetLabelsToNodesRequestProto) returns (GetLabelsToNodesResponseProto);
  rpc getClusterNodeLabels (GetClusterNodeLabelsRequestProto) returns (GetClusterNodeLabelsResponseProto);
  rpc updateApplicationPriority (UpdateApplicationPriorityRequestProto) returns (UpdateApplicationPriorityResponseProto);
  rpc signalToContainer(SignalContainerRequestProto) returns (SignalContainerResponseProto);
  rpc updateApplicationTimeouts (UpdateApplicationTimeoutsRequestProto) returns (UpdateApplicationTimeoutsResponseProto);
  rpc getResourceProfiles(GetAllResourceProfilesRequestProto) returns (GetAllResourceProfilesResponseProto);
  rpc getResourceProfile(GetResourceProfileRequestProto) returns (GetResourceProfileResponseProto);
  rpc getResourceTypeInfo(GetAllResourceTypeInfoRequestProto) returns (GetAllResourceTypeInfoResponseProto);
  rpc getClusterNodeAttributes (GetClusterNodeAttributesRequestProto) returns (GetClusterNodeAttributesResponseProto);
  rpc getAttributesToNodes (GetAttributesToNodesRequestProto) returns (GetAttributesToNodesResponseProto);
  rpc getNodesToAttributes (GetNodesToAttributesRequestProto) returns (GetNodesToAttributesResponseProto);
}

// Repository-dump boundary (verbatim): -------------------------------------------------------------------------------- /pkg/copilot-agent/server/server.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package server 18 | 19 | import ( 20 | "context" 21 | "fmt" 22 | "net" 23 | "net/http" 24 | "os" 25 | "path/filepath" 26 | "time" 27 | 28 | "github.com/gin-gonic/gin" 29 | "github.com/koordinator-sh/koordinator/pkg/koordlet/util/system" 30 | v1 "k8s.io/api/core/v1" 31 | "k8s.io/klog/v2" 32 | 33 | "github.com/koordinator-sh/yarn-copilot/pkg/copilot-agent/nm" 34 | ) 35 | 36 | type YarnCopilotServer struct { 37 | mgr *nm.NodeMangerOperator 38 | unixPath string 39 | } 40 | 41 | func NewYarnCopilotServer(mgr *nm.NodeMangerOperator, unixPath string) *YarnCopilotServer { 42 | return &YarnCopilotServer{mgr: mgr, unixPath: unixPath} 43 | } 44 | 45 | func (y *YarnCopilotServer) Run(ctx context.Context) error { 46 | e := gin.New() 47 | e.GET("/health", y.Health) 48 | e.GET("/information", y.Information) 49 | e.GET("/v1/container", y.GetContainer) 50 | e.GET("/v1/containers", y.ListContainers) 51 | e.POST("/v1/killContainer", y.KillContainer) 52 | e.POST("/v1/killContainersByResource", y.KillContainerByResource) 53 | 54 | server := &http.Server{ 55 | Handler: e, 56 | } 57 | sockDir := filepath.Dir(y.unixPath) 58 | _ = os.MkdirAll(sockDir, os.ModePerm) 59 | if system.FileExists(y.unixPath) { 60 | _ = os.Remove(y.unixPath) 61 | } 62 | listener, err := net.Listen("unix", y.unixPath) 63 | if err != nil { 64 | fmt.Printf("Failed to listen UNIX socket: %v", err) 65 | os.Exit(1) 66 | } 67 | defer func() { 68 | _ = os.Remove(y.unixPath) 69 | }() 70 | go func() { 71 | _ = server.Serve(listener) 72 | }() 73 | //for { 74 | // select { 75 | // case <-ctx.Done(): 76 | // 77 | // } 78 | //} 79 | for range ctx.Done() { 80 | klog.Info("graceful shutdown") 81 | if err := server.Shutdown(ctx); err != nil { 82 | klog.Errorf("Server forced to shutdown: %v", err) 83 | return err 84 | } 85 | } 86 | return nil 87 | } 88 | 89 | func (y *YarnCopilotServer) Health(ctx *gin.Context) { 90 | ctx.JSON(http.StatusOK, "ok") 91 | } 92 | 93 | type PluginInfo struct { 94 | Name string 
`json:"name"` 95 | Version string `json:"version"` 96 | } 97 | 98 | func (y *YarnCopilotServer) Information(ctx *gin.Context) { 99 | ctx.JSON(http.StatusOK, &PluginInfo{ 100 | Name: "yarn", 101 | Version: "v1", 102 | }) 103 | } 104 | 105 | func (y *YarnCopilotServer) ListContainers(ctx *gin.Context) { 106 | listContainers, err := y.mgr.ListContainers() 107 | if err != nil { 108 | klog.Error(err) 109 | ctx.JSON(http.StatusBadRequest, err) 110 | return 111 | } 112 | res := make([]*ContainerInfo, 0, len(listContainers.Containers.Items)) 113 | for _, container := range listContainers.Containers.Items { 114 | if container.IsFinalState() { 115 | continue 116 | } 117 | res = append(res, ParseContainerInfo(&container, y.mgr)) 118 | } 119 | ctx.JSON(http.StatusOK, res) 120 | } 121 | 122 | func (y *YarnCopilotServer) GetContainer(ctx *gin.Context) { 123 | containerID := ctx.Query("containerID") 124 | container, err := y.mgr.GetContainer(containerID) 125 | if err != nil { 126 | ctx.JSON(http.StatusBadRequest, err) 127 | return 128 | } 129 | ctx.JSON(http.StatusOK, ParseContainerInfo(container, y.mgr)) 130 | } 131 | 132 | type KillRequest struct { 133 | ContainerID string `json:"containerID,omitempty"` 134 | Resources v1.ResourceList `json:"resources,omitempty"` 135 | } 136 | 137 | type KillInfo struct { 138 | Items []*ContainerInfo `json:"items,omitempty"` 139 | } 140 | 141 | type ContainerInfo struct { 142 | Name string `json:"name"` 143 | Namespace string `json:"namespace"` 144 | UID string `json:"uid"` 145 | Labels map[string]string `json:"labels"` 146 | Annotations map[string]string `json:"annotations"` 147 | Priority int32 `json:"priority"` 148 | CreateTimestamp time.Time `json:"createTimestamp"` 149 | 150 | CgroupDir string `json:"cgroupDir"` 151 | HostNetwork bool `json:"hostNetwork"` 152 | Resources v1.ResourceRequirements `json:"resources"` 153 | } 154 | 155 | func (y *YarnCopilotServer) KillContainer(ctx *gin.Context) { 156 | var kr KillRequest 157 | if err := 
ctx.BindJSON(&kr); err != nil { 158 | ctx.JSON(http.StatusBadRequest, err) 159 | return 160 | } 161 | container, err := y.mgr.GetContainer(kr.ContainerID) 162 | if err != nil { 163 | ctx.JSON(http.StatusBadRequest, err) 164 | return 165 | } 166 | if err := y.mgr.KillContainer(kr.ContainerID); err != nil { 167 | ctx.JSON(http.StatusBadRequest, err) 168 | return 169 | } 170 | ctx.JSON(http.StatusOK, KillInfo{Items: []*ContainerInfo{ParseContainerInfo(container, y.mgr)}}) 171 | } 172 | 173 | func (y *YarnCopilotServer) KillContainerByResource(ctx *gin.Context) { 174 | } 175 | -------------------------------------------------------------------------------- /cmd/yarn-operator/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "flag" 21 | "math/rand" 22 | "net/http" 23 | _ "net/http/pprof" 24 | "os" 25 | "time" 26 | 27 | "github.com/spf13/pflag" 28 | "k8s.io/client-go/rest" 29 | "k8s.io/client-go/tools/leaderelection/resourcelock" 30 | "k8s.io/klog/v2" 31 | "k8s.io/klog/v2/klogr" 32 | ctrl "sigs.k8s.io/controller-runtime" 33 | 34 | utilclient "github.com/koordinator-sh/koordinator/pkg/util/client" 35 | "github.com/koordinator-sh/koordinator/pkg/util/fieldindex" 36 | "github.com/koordinator-sh/yarn-copilot/cmd/yarn-operator/options" 37 | ) 38 | 39 | var ( 40 | setupLog = ctrl.Log.WithName("setup") 41 | 42 | restConfigQPS = flag.Int("rest-config-qps", 30, "QPS of rest config.") 43 | restConfigBurst = flag.Int("rest-config-burst", 50, "Burst of rest config.") 44 | ) 45 | 46 | func main() { 47 | var metricsAddr, pprofAddr string 48 | var healthProbeAddr string 49 | var enableLeaderElection, enablePprof bool 50 | var leaderElectionNamespace string 51 | var namespace string 52 | var syncPeriodStr string 53 | flag.StringVar(&metricsAddr, "metrics-addr", ":8080", "The address the metric endpoint binds to.") 54 | flag.StringVar(&healthProbeAddr, "health-probe-addr", ":8000", "The address the healthz/readyz endpoint binds to.") 55 | flag.BoolVar(&enableLeaderElection, "enable-leader-election", true, "Whether you need to enable leader election.") 56 | flag.StringVar(&leaderElectionNamespace, "leader-election-namespace", "koordinator-system", 57 | "This determines the namespace in which the leader election configmap will be created, it will use in-cluster namespace if empty.") 58 | flag.StringVar(&namespace, "namespace", "", 59 | "Namespace if specified restricts the manager's cache to watch objects in the desired namespace. 
Defaults to all namespaces.") 60 | flag.BoolVar(&enablePprof, "enable-pprof", true, "Enable pprof for controller manager.") 61 | flag.StringVar(&pprofAddr, "pprof-addr", ":8090", "The address the pprof binds to.") 62 | flag.StringVar(&syncPeriodStr, "sync-period", "", "Determines the minimum frequency at which watched resources are reconciled.") 63 | opts := options.NewOptions() 64 | opts.InitFlags(flag.CommandLine) 65 | //sloconfig.InitFlags(flag.CommandLine) 66 | //utilfeature.DefaultMutableFeatureGate.DefaultMutableFeatureGateAddFlag(pflag.CommandLine) 67 | klog.InitFlags(nil) 68 | pflag.CommandLine.AddGoFlagSet(flag.CommandLine) 69 | pflag.Parse() 70 | rand.Seed(time.Now().UnixNano()) 71 | ctrl.SetLogger(klogr.New()) 72 | // features.SetDefaultFeatureGates() 73 | 74 | if enablePprof { 75 | go func() { 76 | if err := http.ListenAndServe(pprofAddr, nil); err != nil { 77 | setupLog.Error(err, "unable to start pprof") 78 | } 79 | }() 80 | } 81 | 82 | cfg := ctrl.GetConfigOrDie() 83 | setRestConfig(cfg) 84 | cfg.UserAgent = "koordinator-yarn-operator" 85 | 86 | setupLog.Info("new clientset registry") 87 | //err := extclient.NewRegistry(cfg) 88 | //if err != nil { 89 | // setupLog.Error(err, "unable to init koordinator clientset and informer") 90 | // os.Exit(1) 91 | //} 92 | 93 | var syncPeriod *time.Duration 94 | if syncPeriodStr != "" { 95 | d, err := time.ParseDuration(syncPeriodStr) 96 | if err != nil { 97 | setupLog.Error(err, "invalid sync period flag") 98 | } else { 99 | syncPeriod = &d 100 | } 101 | } 102 | mgr, err := ctrl.NewManager(cfg, ctrl.Options{ 103 | Scheme: options.Scheme, 104 | MetricsBindAddress: metricsAddr, 105 | HealthProbeBindAddress: healthProbeAddr, 106 | LeaderElection: enableLeaderElection, 107 | LeaderElectionID: "koordinator-yarn-operator", 108 | LeaderElectionNamespace: leaderElectionNamespace, 109 | LeaderElectionResourceLock: resourcelock.ConfigMapsLeasesResourceLock, 110 | Namespace: namespace, 111 | SyncPeriod: syncPeriod, 112 | 
NewClient: utilclient.NewClient, 113 | }) 114 | if err != nil { 115 | setupLog.Error(err, "unable to start manager") 116 | os.Exit(1) 117 | } 118 | 119 | setupLog.Info("register field index") 120 | if err := fieldindex.RegisterFieldIndexes(mgr.GetCache()); err != nil { 121 | setupLog.Error(err, "failed to register field index") 122 | os.Exit(1) 123 | } 124 | 125 | if err := opts.ApplyTo(mgr); err != nil { 126 | setupLog.Error(err, "unable to setup controllers") 127 | os.Exit(1) 128 | } 129 | 130 | // +kubebuilder:scaffold:builder 131 | 132 | ctx := ctrl.SetupSignalHandler() 133 | 134 | setupLog.Info("starting manager") 135 | if err := mgr.Start(ctx); err != nil { 136 | setupLog.Error(err, "problem running manager") 137 | os.Exit(1) 138 | } 139 | } 140 | 141 | func setRestConfig(c *rest.Config) { 142 | if *restConfigQPS > 0 { 143 | c.QPS = float32(*restConfigQPS) 144 | } 145 | if *restConfigBurst > 0 { 146 | c.Burst = *restConfigBurst 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /hack/generate-yarn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright 2022 The Koordinator Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
set -o errexit
set -o nounset
set -o pipefail

GOYARN_PKG_ROOT="github.com/koordinator-sh/yarn-copilot"
GOYARN_ROOT=$(dirname "${BASH_SOURCE[0]}")/..
GOYARN_API_PATH="${GOYARN_ROOT}/pkg/yarn/apis/proto"
YARN_API_FILES="$( find ${GOYARN_API_PATH} -name "*.proto" )"
echo ">> generate go pkgs for yarn proto files in ${GOYARN_API_PATH}"
echo ">> api file names: ${YARN_API_FILES}"

# Generate the Go package for one proto file by running protoc on it.
# Reads PROTO_PATH_ARGS, which is assembled further down before the first call.
# input: "./hack/../pkg/yarn/apis/proto/hadoopyarn/server/resourcemanager_administration_protocol.proto"
function generate_code() {
  local yarn_api_proto=${1}
  # Declared separately from the assignment so that a failing pipeline still
  # trips errexit ("local x=$(...)" would mask the exit status).
  local file_name
  file_name="$( echo ${yarn_api_proto} | grep -Eo "[A-Za-z_]*\.proto$" )" # resourcemanager_administration_protocol.proto
  local yarn_api_out_path=${yarn_api_proto%/*.proto} # "./hack/../pkg/yarn/apis/proto/hadoopyarn/server

  local file_pkg_map_str
  file_pkg_map_str="$( generate_import_files_pkg_map ${file_name} )"
  local -a file_pkg_map=($file_pkg_map_str)
  echo ">> api file ${file_name} has related pkg:" "${file_pkg_map[@]}"

  # --go_opt=MSecurity.proto=github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopcommon
  # --go-grpc_opt=MSecurity.proto=github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopcommon
  local PKG_NAME_ARGS=""
  local RPC_PKG_NAME_ARGS=""
  local file_pkg_kv mapped_file pkg_name
  for file_pkg_kv in "${file_pkg_map[@]}"
  do
    # Each entry is "<proto file>:<go package>". A dedicated variable is used
    # here so that file_name keeps naming the file being generated.
    mapped_file="${file_pkg_kv%%:*}"
    pkg_name="${file_pkg_kv##*:}"
    PKG_NAME_ARGS+="--go_opt=M${mapped_file}=${pkg_name} "
    RPC_PKG_NAME_ARGS+="--go-grpc_opt=M${mapped_file}=${pkg_name} "
  done

  #proto_cmd="protoc ${PROTO_PATH_ARGS} ${PKG_NAME_ARGS} --go_opt=paths=source_relative --go_out=${yarn_api_out_path} --go-grpc_opt=paths=source_relative ${RPC_PKG_NAME_ARGS} --go-grpc_out=${yarn_api_out_path} ${yarn_api_proto}"
  local proto_cmd="protoc ${PROTO_PATH_ARGS} ${PKG_NAME_ARGS} --go_opt=paths=source_relative --go_out=${yarn_api_out_path} ${yarn_api_proto}"

  echo ">> ready to generate for ${file_name}, protoc command:"
  echo "${proto_cmd}"
  ${proto_cmd}
}


# Print the space-separated "<proto file>:<go package>" pairs for a proto file
# and, recursively, for every file it imports.
# input: resourcemanager_administration_protocol.proto or server/yarn_server_resourcemanager_service_protos.proto
# output: (yarn_server_resourcemanager_service_protos.proto:koordinator-sh/yarn-copilot/apis/proto/hadoopyarn/server)
function generate_import_files_pkg_map() {
  local input_file_name=${1}
  local file_name file_path
  file_name="$( echo "${input_file_name}" | grep -Eo "[A-Za-z_]*\.proto$" )" # resourcemanager_administration_protocol.proto
  file_path="$( find ${GOYARN_API_PATH} -name ${file_name} )"

  local -a file_pkg_map=()
  local file_relative_path=${file_path##${GOYARN_ROOT}} # /pkg/yarn/apis/proto/hadoopyarn/server/resourcemanager_administration_protocol.proto
  local pkg_relative_path=${file_relative_path%/*.proto} # /pkg/yarn/apis/proto/hadoopyarn/server
  local file_pkg=${GOYARN_PKG_ROOT}${pkg_relative_path} # github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopyarn/server
  file_pkg_map+=("${input_file_name}:${file_pkg}")

  # server/yarn_server_resourcemanager_service_protos.proto
  local -a import_paths
  import_paths=("$( grep -E "import \".*.proto\";" ${file_path} | grep -Eo "\".*\"" | sed "s/\"//g" )")
  if [ ! -z "${import_paths}" ]; then
    local import_path
    # Unquoted on purpose: the single captured string is split into one word
    # per imported file.
    for import_path in ${import_paths[@]}
    do
      file_pkg_map+=("$( generate_import_files_pkg_map ${import_path} )")
    done
  fi
  echo "${file_pkg_map[@]}"
}

yarn_api_pkg_relative_path_list=()
for yarn_api_file in ${YARN_API_FILES}
do
  # e.g. /pkg/yarn/apis/proto/hadoopcommon/IpcConnectionContext.proto
  yarn_api_file_relative_path=${yarn_api_file##${GOYARN_ROOT}}
  # e.g. /pkg/yarn/apis/proto/hadoopcommon
  yarn_api_pkg_relative_path=${yarn_api_file_relative_path%/*.proto}
  yarn_api_pkg_relative_path_list+=("${yarn_api_pkg_relative_path}")
done
# de-duplicate the package directories and sort them in reverse order
yarn_api_pkg_relative_path_list=($(echo "${yarn_api_pkg_relative_path_list[@]}" | tr ' ' '\n' | sort -ur | tr '\n' ' '))
echo ">> proto api file paths: " "${yarn_api_pkg_relative_path_list[@]}"

# --proto_path=./hack/../pkg/yarn/apis/proto/hadoopcommon --proto_path=./hack/../pkg/yarn/apis/proto/hadoopyarn --proto_path=./hack/../pkg/yarn/apis/proto/hadoopyarn/server
PROTO_PATH_ARGS=""
for yarn_api_pkg_relative_path in "${yarn_api_pkg_relative_path_list[@]}"
do
  PROTO_PATH_ARGS+="--proto_path=${GOYARN_ROOT}${yarn_api_pkg_relative_path} "
done

# generate go pkg for each api file
for api_file_path in $YARN_API_FILES
do
  api_file_name="$( echo "${api_file_path}" | grep -Eo "[A-Za-z_]*\.proto$" )"
  echo ">> generate go pkg for ${api_file_name}"
  generate_code ${api_file_path}
done

# Repository-dump boundary (verbatim): -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 |

Koordinator YARN Copilot

3 | Koordinator 4 |

5 | 6 | [![License](https://img.shields.io/github/license/koordinator-sh/koordinator.svg?color=4EB1BA&style=flat-square)](https://opensource.org/licenses/Apache-2.0) 7 | [![GitHub release](https://img.shields.io/github/v/release/koordinator-sh/yarn-copilot.svg?style=flat-square)](https://github.com/koordinator-sh/yarn-copilot/releases/latest) 8 | [![CI](https://img.shields.io/github/actions/workflow/status/koordinator-sh/yarn-copilot/ci.yaml?label=CI&logo=github&style=flat-square&branch=main)](https://github.com/koordinator-sh/yarn-copilot/actions/workflows/ci.yaml) 9 | [![Go Report Card](https://goreportcard.com/badge/github.com/koordinator-sh/yarn-copilot?style=flat-square)](https://goreportcard.com/report/github.com/koordinator-sh/yarn-copilot) 10 | [![codecov](https://img.shields.io/codecov/c/github/koordinator-sh/yarn-copilot?logo=codecov&style=flat-square)](https://codecov.io/github/koordinator-sh/yarn-copilot) 11 | [![PRs Welcome](https://badgen.net/badge/PRs/welcome/green?icon=https://api.iconify.design/octicon:git-pull-request.svg?color=white&style=flat-square)](CONTRIBUTING.md) 12 | [![Slack](https://badgen.net/badge/slack/join/4A154B?icon=slack&style=flat-square)](https://join.slack.com/t/koordinator-sh/shared_invite/zt-1756qoub4-Cn4~esfdlfAPsD7cwO2NzA) 13 | 14 | 15 | English | [简体中文](./README-zh_CN.md) 16 | ## Introduction 17 | 18 | Koordinator has supported hybrid orchestration workloads on Kubernetes, so that batch jobs can use the requested but unused resource 19 | as `koord-batch` priority and `BE` QoS class to improve the cluster utilization. However, there are still lots of applications 20 | running beyond K8s such as Apache Hadoop YARN. As a resource management platform in BigData ecosystem, YARN has supported 21 | a number of computing engines including MapReduce, Spark, Flink, Presto, etc.
22 | 23 | In order to extend the co-location scenario of Koordinator, the community now provides a Hadoop YARN extension suite, 24 | `Koordinator YARN Copilot`, for the BigData ecosystem, which supports running Hadoop YARN jobs on `koord-batch` resources alongside 25 | other K8s pods. The `Koordinator YARN Copilot` has the following characteristics: 26 | 27 | - Open-Source native: implemented against the open-source version of Hadoop YARN, so there is no hack inside YARN modules. 28 | - Unified resource priority and QoS strategy: the suite targets the `koord-batch` priority of Koordinator, and is also managed by the QoS strategies of koordlet. 29 | - Resource sharing on node level: node resources of `koord-batch` priority can be requested by both YARN tasks and `Batch` pods. 30 | - Adaptive for multiple environments: the suite can be run under any environment, including public cloud or IDC. 31 | 32 | ## Quick Start 33 | 34 | You can view the full documentation from the [Koordinator website](https://koordinator.sh/docs). 35 | 36 | - Install or upgrade Koordinator with [the latest version](https://koordinator.sh/docs/installation). 37 | - Refer to the [best practices](https://koordinator.sh/docs/next/best-practices/colocation-of-hadoop-yarn) for 38 | detailed instructions on running Hadoop YARN jobs with Koordinator batch resources in K8s.
49 | 50 | ## Contributing 51 | 52 | You are warmly welcome to hack on Koordinator. We have prepared a detailed guide [CONTRIBUTING.md](https://github.com/koordinator-sh/koordinator/CONTRIBUTING.md). 53 | 54 | ## Community 55 | 56 | The [koordinator-sh/community repository](https://github.com/koordinator-sh/community) hosts all information about 57 | the community, membership and how to become a member, developing inspection, who to contact about what, etc. 58 | 59 | We encourage all contributors to become members. We aim to grow an active, healthy community of contributors, reviewers, 60 | and code owners. Learn more about requirements and responsibilities of membership in 61 | the [community membership](https://github.com/koordinator-sh/community/blob/main/community-membership.md) page. 62 | 63 | Active communication channels: 64 | 65 | - Bi-weekly Community Meeting (APAC, *Chinese*): 66 | - Tuesday 19:30 GMT+8 (Asia/Shanghai) 67 | - [Meeting Link(DingTalk)](https://meeting.dingtalk.com/j/cgTTojEI8Zy) 68 | - [Notes and agenda](https://shimo.im/docs/m4kMLdgO1LIma9qD) 69 | - Slack(English): [koordinator channel](https://kubernetes.slack.com/channels/koordinator) in Kubernetes workspace 70 | - DingTalk(Chinese): Search Group ID `33383887` or scan the following QR Code 71 | 72 |
73 | Dingtalk QRCode 74 |
75 | 76 | ## License 77 | 78 | Koordinator is licensed under the Apache License, Version 2.0. See [LICENSE](./LICENSE) for the full license text. 79 | 85 | 86 | ## Security 87 | Please report vulnerabilities by email to kubernetes-security@service.aliyun.com. Also see our [SECURITY.md](./SECURITY.md) file for details. 88 | -------------------------------------------------------------------------------- /pkg/yarn/client/client.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package client 18 | 19 | import ( 20 | "fmt" 21 | "k8s.io/klog/v2" 22 | 23 | "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopcommon" 24 | "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopyarn" 25 | yarnserver "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopyarn/server" 26 | yarnconf "github.com/koordinator-sh/yarn-copilot/pkg/yarn/config" 27 | ) 28 | 29 | type YarnClient interface { 30 | Initialize() error 31 | Reinitialize() error 32 | Close() 33 | UpdateNodeResource(request *yarnserver.UpdateNodeResourceRequestProto) (*yarnserver.UpdateNodeResourceResponseProto, error) 34 | GetClusterNodes(request *hadoopyarn.GetClusterNodesRequestProto) (*hadoopyarn.GetClusterNodesResponseProto, error) 35 | } 36 | 37 | var _ YarnClient = &yarnClient{} 38 | 39 | type yarnClient struct { 40 | confDir string 41 | conf yarnconf.YarnConfiguration 42 | haEnabled bool 43 | activeRMAdminAddress *string 44 | activeRMAddress *string 45 | clusterID string 46 | } 47 | 48 | func NewYarnClient(confDir string, clusterID string) YarnClient { 49 | return &yarnClient{confDir: confDir, clusterID: clusterID} 50 | } 51 | 52 | func (c *yarnClient) Initialize() error { 53 | if conf, err := yarnconf.NewYarnConfiguration(c.confDir, c.clusterID); err == nil { 54 | // TODO use flags for conf dir config 55 | c.conf = conf 56 | } else { 57 | return err 58 | } 59 | 60 | if ha, err := c.conf.GetRMEnabledHA(); err == nil { 61 | c.haEnabled = ha 62 | } else { 63 | return err 64 | } 65 | 66 | // ha not enabled, use default conf 67 | if !c.haEnabled { 68 | if rmAdminAddr, err := c.conf.GetRMAdminAddress(); err == nil { 69 | c.activeRMAdminAddress = &rmAdminAddr 70 | } else { 71 | return err 72 | } 73 | if rmAddr, err := c.conf.GetRMAddress(); err == nil { 74 | c.activeRMAddress = &rmAddr 75 | } else { 76 | return err 77 | } 78 | return nil 79 | } 80 | 81 | // ha enabled, get active rm address by id 82 | var activeRMID string 83 | var err error 
84 | if activeRMID, err = c.GetActiveRMID(); err != nil { 85 | return err 86 | } 87 | if rmAdminAddr, err := c.conf.GetRMAdminAddressByID(activeRMID); err == nil { 88 | c.activeRMAdminAddress = &rmAdminAddr 89 | } else { 90 | return err 91 | } 92 | if rmAddress, err := c.conf.GetRMAddressByID(activeRMID); err == nil { 93 | c.activeRMAddress = &rmAddress 94 | } else { 95 | return err 96 | } 97 | 98 | return nil 99 | } 100 | 101 | func (c *yarnClient) Close() { 102 | c.activeRMAdminAddress = nil 103 | c.activeRMAddress = nil 104 | } 105 | 106 | func (c *yarnClient) Reinitialize() error { 107 | c.Close() 108 | return c.Initialize() 109 | } 110 | 111 | func (c *yarnClient) UpdateNodeResource(request *yarnserver.UpdateNodeResourceRequestProto) (*yarnserver.UpdateNodeResourceResponseProto, error) { 112 | if c.activeRMAdminAddress == nil && c.haEnabled { 113 | if err := c.Initialize(); err != nil { 114 | return nil, err 115 | } 116 | } 117 | // TODO check response error code and retry auto 118 | return c.updateNodeResource(request) 119 | } 120 | // GetClusterNodes forwards the request to the active ResourceManager, running Initialize first when HA is enabled and no RM address is currently resolved. 121 | func (c *yarnClient) GetClusterNodes(request *hadoopyarn.GetClusterNodesRequestProto) (*hadoopyarn.GetClusterNodesResponseProto, error) { 122 | if c.activeRMAddress == nil && c.haEnabled { // this call dials the RM (application) address, so that is the field that must be resolved 123 | if err := c.Initialize(); err != nil { 124 | return nil, err 125 | } 126 | } 127 | // TODO check response error code and retry auto 128 | return c.getClusterNodes(request) 129 | } 130 | 131 | func (c *yarnClient) GetActiveRMID() (string, error) { 132 | rmIDs, err := c.conf.GetRMs() 133 | if err != nil { 134 | return "", err 135 | } 136 | for _, rmID := range rmIDs { 137 | rmAdminAddr, err := c.conf.GetRMAdminAddressByID(rmID) 138 | if err != nil { 139 | return "", err 140 | } 141 | haClient, err := CreateYarnHAClient(rmAdminAddr) 142 | if err != nil { 143 | return "", fmt.Errorf("create yarn %v ha client for %v failed %v", rmID, rmAdminAddr, err) 144 | } 145 | resp, err :=
haClient.GetServiceStatus(&hadoopcommon.GetServiceStatusRequestProto{}) 146 | if err != nil { 147 | klog.V(4).Infof("get %v service status for %v failed %v, try next rm", rmID, rmAdminAddr, err) 148 | continue 149 | } 150 | if resp.State != nil && *resp.State == hadoopcommon.HAServiceStateProto_ACTIVE { 151 | return rmID, nil 152 | } 153 | } 154 | return "", fmt.Errorf("active rm not found in %v", rmIDs) 155 | } 156 | 157 | func (c *yarnClient) updateNodeResource(request *yarnserver.UpdateNodeResourceRequestProto) (*yarnserver.UpdateNodeResourceResponseProto, error) { 158 | // TODO keep client alive instead of create every time 159 | adminClient, err := CreateYarnAdminClient(c.conf, c.activeRMAdminAddress) 160 | if err != nil { 161 | return nil, err 162 | } 163 | return adminClient.UpdateNodeResource(request) 164 | } 165 | 166 | func (c *yarnClient) getClusterNodes(request *hadoopyarn.GetClusterNodesRequestProto) (*hadoopyarn.GetClusterNodesResponseProto, error) { 167 | // TODO keep client alive instead of create every time 168 | applicationClient, err := CreateYarnApplicationClient(c.conf, c.activeRMAddress) 169 | if err != nil { 170 | return nil, err 171 | } 172 | return applicationClient.GetClusterNode(request) 173 | } 174 | -------------------------------------------------------------------------------- /pkg/yarn/apis/security/digestmd5.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2013 The Cloudera Inc. 3 | Copyright 2023 The Koordinator Authors. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | 18 | package security 19 | 20 | import ( 21 | "crypto/md5" 22 | "crypto/rand" 23 | "encoding/base64" 24 | "encoding/hex" 25 | "errors" 26 | "strings" 27 | 28 | "k8s.io/klog/v2" 29 | 30 | hadoop_common "github.com/koordinator-sh/yarn-copilot/pkg/yarn/apis/proto/hadoopcommon" 31 | ) 32 | // getChallengeParams parses a comma-separated list of key=value entries into a map, stripping one pair of surrounding double quotes from each value; an entry without '=' is an error. 33 | func getChallengeParams(challenge string) (map[string]string, error) { 34 | challengeParams := make(map[string]string) 35 | splits := strings.Split(challenge, ",") 36 | 37 | for _, split := range splits { 38 | //split on first '=' 39 | keyVal := strings.SplitN(split, "=", 2) 40 | 41 | if len(keyVal) != 2 { 42 | klog.Warningf("found invalid param: %s", split) 43 | return nil, errors.New("found invalid param: " + split) 44 | } 45 | 46 | key := keyVal[0] 47 | value := keyVal[1] 48 | 49 | //some challenge params are quoted (realm, nonce, qop). 50 | //strip these out; the length guard keeps a lone quote character from being sliced out of range. 51 | quote := "\"" 52 | if len(value) >= 2*len(quote) && strings.HasPrefix(value, quote) && strings.HasSuffix(value, quote) { 53 | value = value[len(quote):] 54 | value = value[:len(value)-len(quote)] 55 | } 56 | 57 | challengeParams[key] = value 58 | } 59 | 60 | return challengeParams, nil 61 | } 62 | 63 | // we only support a very specific digest-md5 mechanism for the moment 64 | // multiple realm, qop not supported 65 | func validateChallengeParameters(challengeParams map[string]string) error { 66 | var errString string 67 | 68 | realm, exists := challengeParams["realm"] 69 | if !exists || len(realm) == 0 { 70 | errString += "missing or invalid realm. " 71 | } 72 | 73 | nonce, exists := challengeParams["nonce"] 74 | if !exists || len(nonce) == 0 { 75 | errString += "missing or invalid nonce. " 76 | } 77 | 78 | qop, exists := challengeParams["qop"] 79 | if !exists || qop != "auth" { 80 | errString += "missing, invalid or unsupported qop.
" 81 | } 82 | 83 | charset, exists := challengeParams["charset"] 84 | if !exists || charset != "utf-8" { 85 | errString += "missing, invalid or unsupported charset. " 86 | } 87 | 88 | algorithm, exists := challengeParams["algorithm"] 89 | if !exists || algorithm != "md5-sess" { 90 | errString += "missing, invalid or unsupported algorithm. " 91 | } 92 | 93 | if len(errString) > 0 { 94 | return errors.New(errString) 95 | } 96 | 97 | return nil 98 | } 99 | 100 | func generateChallengeReponse(username string, password string, protocol string, serverId string, challengeParams map[string]string) (string, error) { 101 | buffer := make([]string, 0, 128) 102 | 103 | charset := "charset=utf-8" 104 | quote := "\"" 105 | comma := "," 106 | maxbuf := "maxbuf=65536" 107 | nonceCount := "nc=00000001" 108 | nonceCountHex := "00000001" 109 | 110 | realm := challengeParams["realm"] 111 | nonce := challengeParams["nonce"] 112 | qop := challengeParams["qop"] 113 | digestUri := protocol + "/" + serverId 114 | method := "AUTHENTICATE" 115 | 116 | buffer = append(buffer, charset, comma) 117 | buffer = append(buffer, "username=", quote, username, quote, comma) 118 | buffer = append(buffer, "realm=", quote, realm, quote, comma) 119 | buffer = append(buffer, "nonce=", quote, nonce, quote, comma) 120 | buffer = append(buffer, nonceCount, comma) //nonce count is one 121 | 122 | //generate a response nonce 123 | count := 30 124 | nonceBuffer := make([]byte, count) 125 | _, err := rand.Read(nonceBuffer) 126 | if err != nil { 127 | return "", err 128 | } 129 | encodedNonce := base64.StdEncoding.EncodeToString(nonceBuffer) 130 | buffer = append(buffer, "cnonce=", quote, encodedNonce, quote, comma) 131 | 132 | buffer = append(buffer, "digest-uri=", quote, digestUri, quote, comma) 133 | buffer = append(buffer, maxbuf, comma) 134 | 135 | //for the md5-sess/qop=auth case, the computation is : 136 | //HA1=MD5(MD5(username:realm:password):nonce:cnonce) 137 | //HA2=MD5(method:digestURI)// 138 | 
//response=MD5(HA1:nonce:nonceCount:clientNonce:qop:HA2) 139 | 140 | hashSize := 16 //16 bytes 141 | ha1Part1 := username + ":" + realm + ":" + password 142 | ha1Part2 := ":" + nonce + ":" + encodedNonce 143 | ha1Part1md5 := md5.Sum([]byte(ha1Part1)) 144 | ha1Input := string(ha1Part1md5[:hashSize]) + ha1Part2 145 | 146 | HA1 := md5.Sum([]byte(ha1Input)) 147 | HA2 := md5.Sum([]byte(method + ":" + digestUri)) 148 | 149 | ha1Hex := hex.EncodeToString(HA1[:hashSize]) 150 | ha2Hex := hex.EncodeToString(HA2[:hashSize]) 151 | 152 | responseHashInput := string(ha1Hex + ":" + nonce + ":" + nonceCountHex + ":" + encodedNonce + ":" + qop + ":" + ha2Hex) 153 | responseHash := md5.Sum([]byte(responseHashInput)) 154 | responseHashHex := hex.EncodeToString(responseHash[:hashSize]) 155 | //end digest-md5 computation 156 | 157 | buffer = append(buffer, "response=", responseHashHex, comma) 158 | buffer = append(buffer, "qop=", qop) 159 | response := strings.Join(buffer, "") 160 | 161 | klog.V(5).Infof("generated challenge response: %s", response) 162 | 163 | return response, nil 164 | } 165 | 166 | func GetDigestMD5ChallengeResponse(protocol string, serverId string, challenge []byte, userToken *hadoop_common.TokenProto) (string, error) { 167 | if len(challenge) <= 0 { 168 | klog.Warningf("challenge cannot be empty!") 169 | 170 | return "", errors.New("challenge cannot be empty!") 171 | } 172 | 173 | var err error 174 | 175 | challengeParams, err := getChallengeParams(string(challenge)) 176 | if err != nil { 177 | klog.Warningf("challenge params extraction failure! ", err) 178 | return "", err 179 | } 180 | 181 | err = validateChallengeParameters(challengeParams) 182 | if err != nil { 183 | klog.Warningf("challenge params validation failure! 
", err) 184 | return "", err 185 | } 186 | 187 | username := base64.StdEncoding.EncodeToString(userToken.GetIdentifier()) 188 | password := base64.StdEncoding.EncodeToString(userToken.GetPassword()) 189 | response, err := generateChallengeReponse(username, password, protocol, serverId, challengeParams) 190 | 191 | if err != nil { 192 | klog.Warningf("Failed to generate challenge response! ", err) 193 | return "", err 194 | } 195 | 196 | return response, nil 197 | } 198 | -------------------------------------------------------------------------------- /pkg/yarn/config/yarn_configuration.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2013 The Cloudera Inc. 3 | Copyright 2023 The Koordinator Authors. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | */ 17 | 18 | package conf 19 | 20 | import ( 21 | "fmt" 22 | "strings" 23 | ) 24 | 25 | var ( 26 | YARN_DEFAULT Resource = Resource{"yarn-default.xml", false} 27 | YARN_SITE Resource = Resource{"yarn-site.xml", true} 28 | ) 29 | 30 | const ( 31 | // yarn-site 32 | YARN_PREFIX = "yarn." 33 | RM_PREFIX = YARN_PREFIX + "resourcemanager." 34 | NM_PREFIX = YARN_PREFIX + "nodemanager." 
35 | 36 | RM_ADDRESS = RM_PREFIX + "address" 37 | RM_SCHEDULER_ADDRESS = RM_PREFIX + "scheduler.address" 38 | RM_ADMIN_ADDRESS = RM_PREFIX + "admin.address" 39 | RM_HA_ENABLED = RM_PREFIX + "ha.enabled" 40 | RM_HA_RM_IDS = RM_PREFIX + "ha.rm-ids" 41 | RM_AM_EXPIRY_INTERVAL_MS = YARN_PREFIX + "am.liveness-monitor.expiry-interval-ms" 42 | 43 | RM_KEYTAB = RM_PREFIX + "keytab" 44 | NM_KEYTAB = NM_PREFIX + "keytab" 45 | 46 | RM_PRINCIPLE = RM_PREFIX + "principal" 47 | 48 | DEFAULT_RM_ADDRESS = "0.0.0.0:8032" 49 | DEFAULT_RM_SCHEDULER_ADDRESS = "0.0.0.0:8030" 50 | DEFAULT_RM_ADMIN_ADDRESS = "0.0.0.0:8033" 51 | DEFAULT_RM_AM_EXPIRY_INTERVAL_MS = 600000 52 | DEFAULT_RM_HA_ENABLED = false 53 | 54 | DEFAULT_RM_KEYTAB = "/etc/krb5.keytab" // https://hadoop.apache.org/docs/r2.7.3/hadoop-yarn/hadoop-yarn-common/yarn-default.xml 55 | DEFAULT_RM_PRINCIPLE = "" 56 | 57 | // core-site 58 | IPC_CLIENT_TCPNODELAY = "ipc.client.tcpnodelay" 59 | DEFAULT_IPC_CLIENT_TCPNODELAY = false 60 | 61 | SECURITY_AUTHENTICATION = "hadoop.security.authentication" 62 | DEFAULT_AUTHENTICATION = "simple" 63 | ) 64 | 65 | type yarn_configuration struct { 66 | conf Configuration 67 | } 68 | 69 | type YarnConfiguration interface { 70 | // yarn-site 71 | GetRMAddress() (string, error) 72 | GetRMSchedulerAddress() (string, error) 73 | GetRMAdminAddress() (string, error) 74 | GetRMEnabledHA() (bool, error) 75 | GetRMs() ([]string, error) 76 | GetRMAdminAddressByID(rmID string) (string, error) 77 | GetRMAddressByID(rmID string) (string, error) 78 | GetResourceManagerKeytab() (string, error) 79 | GetResourceManagerPrincipal() (string, error) 80 | 81 | SetRMAddress(address string) error 82 | SetRMSchedulerAddress(address string) error 83 | 84 | // core-site 85 | GetIPCClientTcpNoDelay() (bool, error) 86 | GetSecurityAuthentication() (string, error) 87 | 88 | Get(key string, defaultValue string) (string, error) 89 | GetInt(key string, defaultValue int) (int, error) 90 | 91 | Set(key string, value string) error 
92 | SetInt(key string, value int) error 93 | } 94 | 95 | func (yarnConf *yarn_configuration) Get(key string, defaultValue string) (string, error) { 96 | return yarnConf.conf.Get(key, defaultValue) 97 | } 98 | 99 | func (yarnConf *yarn_configuration) GetInt(key string, defaultValue int) (int, error) { 100 | return yarnConf.conf.GetInt(key, defaultValue) 101 | } 102 | 103 | func (yarnConf *yarn_configuration) GetRMAddress() (string, error) { 104 | return yarnConf.conf.Get(RM_ADDRESS, DEFAULT_RM_ADDRESS) 105 | } 106 | 107 | func (yarnConf *yarn_configuration) GetRMSchedulerAddress() (string, error) { 108 | return yarnConf.conf.Get(RM_SCHEDULER_ADDRESS, DEFAULT_RM_SCHEDULER_ADDRESS) 109 | } 110 | 111 | func (yarnConf *yarn_configuration) GetRMAdminAddress() (string, error) { 112 | return yarnConf.conf.Get(RM_ADMIN_ADDRESS, DEFAULT_RM_ADMIN_ADDRESS) 113 | } 114 | 115 | func (yarnConf *yarn_configuration) GetRMEnabledHA() (bool, error) { 116 | return yarnConf.conf.GetBool(RM_HA_ENABLED, DEFAULT_RM_HA_ENABLED) 117 | } 118 | // GetRMs returns the ResourceManager IDs listed under RM_HA_RM_IDS, or the error reported by the configuration lookup. 119 | func (yarnConf *yarn_configuration) GetRMs() ([]string, error) { 120 | allRMs, err := yarnConf.conf.Get(RM_HA_RM_IDS, "") 121 | if err != nil { 122 | // surface the lookup failure rather than reporting an empty RM list with a nil error 123 | return nil, err 124 | } 125 | // split on commas, trimming blanks and dropping empty IDs (a plain Split turns "" into [""]) 126 | return strings.Fields(strings.ReplaceAll(allRMs, ",", " ")), nil 127 | } 128 | 129 | func (yarnConf *yarn_configuration) GetRMAdminAddressByID(rmID string) (string, error) { 130 | // yarn.resourcemanager.admin.address.rm1 131 | rmAddrKey := fmt.Sprintf("%v.%v", RM_ADMIN_ADDRESS, rmID) 132 | return yarnConf.conf.Get(rmAddrKey, DEFAULT_RM_ADMIN_ADDRESS) 133 | } 134 | 135 | func (yarnConf *yarn_configuration) GetRMAddressByID(rmID string) (string, error) { 136 | // yarn.resourcemanager.address.rm1 137 | rmAddrKey := fmt.Sprintf("%v.%v", RM_ADDRESS, rmID) 138 | return yarnConf.conf.Get(rmAddrKey, DEFAULT_RM_ADDRESS) 139 | } 140 | 141 | func (yarnConf *yarn_configuration) GetResourceManagerKeytab() (string, error) { 142 | return
yarnConf.conf.Get(RM_KEYTAB, DEFAULT_RM_KEYTAB) 143 | } 144 | 145 | func (yarnConf *yarn_configuration) GetResourceManagerPrincipal() (string, error) { 146 | return yarnConf.conf.Get(RM_PRINCIPLE, DEFAULT_RM_PRINCIPLE) 147 | } 148 | 149 | func (yarnConf *yarn_configuration) GetIPCClientTcpNoDelay() (bool, error) { 150 | // ipc.client.tcpnodelay 151 | return yarnConf.conf.GetBool(IPC_CLIENT_TCPNODELAY, DEFAULT_IPC_CLIENT_TCPNODELAY) 152 | } 153 | 154 | func (yarnConf *yarn_configuration) GetSecurityAuthentication() (string, error) { 155 | return yarnConf.conf.Get(SECURITY_AUTHENTICATION, DEFAULT_AUTHENTICATION) 156 | } 157 | 158 | func (yarnConf *yarn_configuration) Set(key string, value string) error { 159 | return yarnConf.conf.Set(key, value) 160 | } 161 | 162 | func (yarnConf *yarn_configuration) SetInt(key string, value int) error { 163 | return yarnConf.conf.SetInt(key, value) 164 | } 165 | 166 | func (yarnConf *yarn_configuration) SetRMAddress(address string) error { 167 | return yarnConf.conf.Set(RM_ADDRESS, address) 168 | } 169 | 170 | func (yarnConf *yarn_configuration) SetRMSchedulerAddress(address string) error { 171 | return yarnConf.conf.Set(RM_SCHEDULER_ADDRESS, address) 172 | } 173 | 174 | func NewYarnConfiguration(hadooConfDir string, clusterID string) (YarnConfiguration, error) { 175 | // for yarn-site.xml with cluster id, read from clusterid.yarn-site.xml 176 | c, err := NewConfigurationResources(hadooConfDir, []Resource{YARN_DEFAULT, YARN_SITE}, configPrefix(clusterID)) 177 | return &yarn_configuration{conf: c}, err 178 | } 179 | 180 | func configPrefix(clusterID string) string { 181 | if clusterID != "" { 182 | return clusterID + "." 183 | } 184 | return "" 185 | } 186 | -------------------------------------------------------------------------------- /pkg/controller/noderesource/yarn_resource.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2022 The Koordinator Authors. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // NOTE: functions in this file can be overwritten for extension 18 | 19 | package noderesource 20 | 21 | import ( 22 | "encoding/json" 23 | 24 | corev1 "k8s.io/api/core/v1" 25 | "k8s.io/apimachinery/pkg/api/resource" 26 | quotav1 "k8s.io/apiserver/pkg/quota/v1" 27 | 28 | "github.com/koordinator-sh/koordinator/apis/extension" 29 | ) 30 | 31 | const ( 32 | BatchCPU = extension.BatchCPU 33 | BatchMemory = extension.BatchMemory 34 | 35 | PodYarnClusterIDAnnotationKey = "yarn.hadoop.apache.org/cluster-id" 36 | YARNAllocationName = "hadoop-yarn" 37 | YARNResourcePriority = extension.PriorityBatch 38 | ) 39 | 40 | func calculate(batchCPU resource.Quantity, batchMemory resource.Quantity) (int64, int64) { 41 | // TODO multiple ratio as buffer 42 | return batchCPU.ScaledValue(resource.Kilo), batchMemory.ScaledValue(resource.Mega) 43 | } 44 | 45 | func GetOriginExtendedAllocatableRes(annotations map[string]string) (corev1.ResourceList, error) { 46 | originAllocatable, err := GetOriginExtendedAllocatable(annotations) 47 | if originAllocatable == nil || err != nil { 48 | return nil, err 49 | } 50 | return originAllocatable.Resources, nil 51 | } 52 | // SetYARNAllocatedResource stores the YARN allocation in annotations as BatchCPU = vcores*1000 and BatchMemory = memoryMB*1024*1024 via SetThirdPartyAllocation. 53 | func SetYARNAllocatedResource(annotations map[string]string, vcores int32, memoryMB int64) error { 54 | resources := map[corev1.ResourceName]resource.Quantity{ 55 | BatchCPU: *resource.NewQuantity(int64(vcores)*1000, resource.DecimalSI), // widen to int64 before scaling so the product cannot overflow int32 56 |
BatchMemory: *resource.NewQuantity(memoryMB*1024*1024, resource.BinarySI), 57 | } 58 | return SetThirdPartyAllocation(annotations, YARNAllocationName, YARNResourcePriority, resources) 59 | } 60 | 61 | func GetYARNAllocatedResource(annotations map[string]string) (corev1.ResourceList, error) { 62 | thirdPartyAllocation, err := GetThirdPartyAllocations(annotations) 63 | if thirdPartyAllocation == nil || err != nil { 64 | return nil, err 65 | } 66 | for _, alloc := range thirdPartyAllocation.Allocations { 67 | if alloc.Name == YARNAllocationName { 68 | return alloc.Resources, nil 69 | } 70 | } 71 | return nil, nil 72 | } 73 | 74 | // TODO mv the followings to koordiantor api 75 | const ( 76 | // batch resource can be shared with other allocators such as Hadoop YARN 77 | // record origin batch allocatable on node for calculating the batch allocatable of K8s and YARN, e.g. 78 | // k8s_batch_allocatable = origin_batch_allocatable - yarn_batch_requested 79 | // yarn_allocatable = origin_batch_allocatable - k8s_batch_requested 80 | NodeOriginExtendedAllocatableAnnotationKey = "node.koordinator.sh/originExtendedAllocatable" 81 | 82 | // record (batch) allocations of other schedulers such as YARN, which should be excluded before updating node extended resource 83 | NodeThirdPartyAllocationsAnnotationKey = "node.koordinator.sh/thirdPartyAllocations" 84 | ) 85 | 86 | type OriginAllocatable struct { 87 | Resources corev1.ResourceList `json:"resources,omitempty"` 88 | } 89 | 90 | func GetOriginExtendedAllocatable(annotations map[string]string) (*OriginAllocatable, error) { 91 | originAllocatableStr, exist := annotations[NodeOriginExtendedAllocatableAnnotationKey] 92 | if !exist { 93 | return nil, nil 94 | } 95 | originAllocatable := &OriginAllocatable{} 96 | if err := json.Unmarshal([]byte(originAllocatableStr), originAllocatable); err != nil { 97 | return nil, err 98 | } 99 | return originAllocatable, nil 100 | } 101 | 102 | func SetOriginExtendedAllocatableRes(annotations 
map[string]string, extendedAllocatable corev1.ResourceList) error { 103 | old, err := GetOriginExtendedAllocatable(annotations) 104 | if old == nil || err != nil { 105 | old = &OriginAllocatable{} 106 | } 107 | if old.Resources == nil { 108 | old.Resources = map[corev1.ResourceName]resource.Quantity{} 109 | } 110 | for resourceName, value := range extendedAllocatable { 111 | old.Resources[resourceName] = value 112 | } 113 | newStr, err := json.Marshal(old) 114 | if err != nil { 115 | return err 116 | } 117 | if annotations == nil { 118 | annotations = map[string]string{} 119 | } 120 | annotations[NodeOriginExtendedAllocatableAnnotationKey] = string(newStr) 121 | return nil 122 | } 123 | 124 | type ThirdPartyAllocations struct { 125 | Allocations []ThirdPartyAllocation `json:"allocations,omitempty"` 126 | } 127 | 128 | type ThirdPartyAllocation struct { 129 | Name string `json:"name"` 130 | Priority extension.PriorityClass `json:"priority"` 131 | Resources corev1.ResourceList `json:"resources,omitempty"` 132 | } 133 | 134 | func GetThirdPartyAllocations(annotations map[string]string) (*ThirdPartyAllocations, error) { 135 | valueStr, exist := annotations[NodeThirdPartyAllocationsAnnotationKey] 136 | if !exist { 137 | return nil, nil 138 | } 139 | object := &ThirdPartyAllocations{} 140 | if err := json.Unmarshal([]byte(valueStr), object); err != nil { 141 | return nil, err 142 | } 143 | return object, nil 144 | } 145 | 146 | func GetThirdPartyAllocatedResByPriority(annotations map[string]string, priority extension.PriorityClass) (corev1.ResourceList, error) { 147 | allocations, err := GetThirdPartyAllocations(annotations) 148 | if err != nil || allocations == nil { 149 | return nil, err 150 | } 151 | result := corev1.ResourceList{} 152 | for _, alloc := range allocations.Allocations { 153 | if alloc.Priority == priority { 154 | result = quotav1.Add(result, alloc.Resources) 155 | } 156 | } 157 | return result, nil 158 | } 159 | 160 | func 
SetThirdPartyAllocation(annotations map[string]string, name string, priority extension.PriorityClass, 161 | resource corev1.ResourceList) error { 162 | // parse or init old allocations 163 | oldAllocations, err := GetThirdPartyAllocations(annotations) 164 | if oldAllocations == nil || err != nil { 165 | oldAllocations = &ThirdPartyAllocations{} 166 | } 167 | if oldAllocations.Allocations == nil { 168 | oldAllocations.Allocations = make([]ThirdPartyAllocation, 0, 1) 169 | } 170 | 171 | // create or update old alloc 172 | newAlloc := ThirdPartyAllocation{ 173 | Name: name, 174 | Priority: priority, 175 | Resources: resource, 176 | } 177 | exist := false 178 | for i := range oldAllocations.Allocations { 179 | if oldAllocations.Allocations[i].Name == name { 180 | oldAllocations.Allocations[i] = newAlloc 181 | exist = true 182 | break 183 | } 184 | } 185 | if !exist { 186 | oldAllocations.Allocations = append(oldAllocations.Allocations, newAlloc) 187 | } 188 | 189 | // update allocation string 190 | newStr, err := json.Marshal(oldAllocations) 191 | if err != nil { 192 | return err 193 | } 194 | if annotations == nil { 195 | annotations = map[string]string{} 196 | } 197 | annotations[NodeThirdPartyAllocationsAnnotationKey] = string(newStr) 198 | return nil 199 | } 200 | -------------------------------------------------------------------------------- /pkg/yarn/apis/proto/hadoopcommon/RpcHeader.proto: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * These .proto interfaces are private and stable. 21 | * Please see http://wiki.apache.org/hadoop/Compatibility 22 | * for what changes are allowed for a *stable* .proto interface. 23 | */ 24 | 25 | syntax = "proto2"; 26 | option java_package = "org.apache.hadoop.ipc.protobuf"; 27 | option java_outer_classname = "RpcHeaderProtos"; 28 | option java_generate_equals_and_hash = true; 29 | package hadoop.common; 30 | 31 | /** 32 | * This is the rpc request header. It is sent with every rpc call. 33 | * 34 | * The format of RPC call is as follows: 35 | * +--------------------------------------------------------------+ 36 | * | Rpc length in bytes (4 bytes int) sum of next two parts | 37 | * +--------------------------------------------------------------+ 38 | * | RpcRequestHeaderProto - serialized delimited ie has len | 39 | * +--------------------------------------------------------------+ 40 | * | RpcRequest The actual rpc request | 41 | * | This request is serialized based on RpcKindProto | 42 | * +--------------------------------------------------------------+ 43 | * 44 | */ 45 | 46 | /** 47 | * RpcKind determine the rpcEngine and the serialization of the rpc request 48 | */ 49 | enum RpcKindProto { 50 | RPC_BUILTIN = 0; // Used for built in calls by tests 51 | RPC_WRITABLE = 1; // Use WritableRpcEngine 52 | RPC_PROTOCOL_BUFFER = 2; // Use ProtobufRpcEngine 53 | } 54 | 55 | 56 | 57 | /** 58 | * Used to pass through the information necessary to continue 59 | * a trace after an RPC is made. 
All we need is the traceid 60 | * (so we know the overarching trace this message is a part of), and 61 | * the id of the current span when this message was sent, so we know 62 | * what span caused the new span we will create when this message is received. 63 | */ 64 | message RPCTraceInfoProto { 65 | optional int64 traceId = 1; // parentIdHigh 66 | optional int64 parentId = 2; // parentIdLow 67 | 68 | } 69 | 70 | /** 71 | * Used to pass through the call context entry after an RPC is made. 72 | */ 73 | message RPCCallerContextProto { 74 | required string context = 1; 75 | optional bytes signature = 2; 76 | } 77 | 78 | message RpcRequestHeaderProto { // the header for the RpcRequest 79 | enum OperationProto { 80 | RPC_FINAL_PACKET = 0; // The final RPC Packet 81 | RPC_CONTINUATION_PACKET = 1; // not implemented yet 82 | RPC_CLOSE_CONNECTION = 2; // close the rpc connection 83 | } 84 | 85 | optional RpcKindProto rpcKind = 1; 86 | optional OperationProto rpcOp = 2; 87 | required sint32 callId = 3; // a sequence number that is sent back in response 88 | required bytes clientId = 4; // Globally unique client ID 89 | // clientId + callId uniquely identifies a request 90 | // retry count, 1 means this is the first retry; the declared default is -1 91 | optional sint32 retryCount = 5 [default = -1]; 92 | optional RPCTraceInfoProto traceInfo = 6; // tracing info 93 | optional RPCCallerContextProto callerContext = 7; // call context 94 | optional int64 stateId = 8; // The last seen Global State ID 95 | } 96 | 97 | 98 | 99 | /** 100 | * Rpc Response Header 101 | * +------------------------------------------------------------------+ 102 | * | Rpc total response length in bytes (4 bytes int) | 103 | * | (sum of next two parts) | 104 | * +------------------------------------------------------------------+ 105 | * | RpcResponseHeaderProto - serialized delimited ie has len | 106 | * +------------------------------------------------------------------+ 107 | * | if request is successful: | 108 | * | -
RpcResponse - The actual rpc response bytes follow | 109 | * | the response header | 110 | * | This response is serialized based on RpcKindProto | 111 | * | if request fails : | 112 | * | The rpc response header contains the necessary info | 113 | * +------------------------------------------------------------------+ 114 | * 115 | * Note that rpc response header is also used when connection setup fails. 116 | * Ie the response looks like a rpc response with a fake callId. 117 | */ 118 | message RpcResponseHeaderProto { 119 | /** 120 | * 121 | * RpcStatus - success or failure 122 | * The responseHeader's errorDetail, exceptionClassName and errorMsg contain 123 | * further details on the error 124 | **/ 125 | 126 | enum RpcStatusProto { 127 | SUCCESS = 0; // RPC succeeded 128 | ERROR = 1; // RPC error - connection left open for future calls 129 | FATAL = 2; // Fatal error - connection closed 130 | } 131 | 132 | enum RpcErrorCodeProto { 133 | 134 | // Non-fatal Rpc error - connection left open for future rpc calls 135 | ERROR_APPLICATION = 1; // RPC Failed - rpc app threw exception 136 | ERROR_NO_SUCH_METHOD = 2; // Rpc error - no such method 137 | ERROR_NO_SUCH_PROTOCOL = 3; // Rpc error - no such protocol 138 | ERROR_RPC_SERVER = 4; // Rpc error on server side 139 | ERROR_SERIALIZING_RESPONSE = 5; // error serializing response 140 | ERROR_RPC_VERSION_MISMATCH = 6; // Rpc protocol version mismatch 141 | 142 | 143 | // Fatal Server side Rpc error - connection closed 144 | FATAL_UNKNOWN = 10; // unknown Fatal error 145 | FATAL_UNSUPPORTED_SERIALIZATION = 11; // IPC layer serialization type invalid 146 | FATAL_INVALID_RPC_HEADER = 12; // fields of RpcHeader are invalid 147 | FATAL_DESERIALIZING_REQUEST = 13; // could not deserialize rpc request 148 | FATAL_VERSION_MISMATCH = 14; // Ipc Layer version mismatch 149 | FATAL_UNAUTHORIZED = 15; // Auth failed 150 | } 151 | 152 | required uint32 callId = 1; // callId used in Request 153 | required RpcStatusProto status = 2; 154
| optional uint32 serverIpcVersionNum = 3; // Sent if success or fail 155 | optional string exceptionClassName = 4; // if request fails 156 | optional string errorMsg = 5; // if request fails, often contains stack trace 157 | optional RpcErrorCodeProto errorDetail = 6; // in case of error 158 | optional bytes clientId = 7; // Globally unique client ID 159 | optional sint32 retryCount = 8 [default = -1]; 160 | optional int64 stateId = 9; // The last written Global State ID 161 | } 162 | 163 | message RpcSaslProto { 164 | enum SaslState { 165 | SUCCESS = 0; 166 | NEGOTIATE = 1; 167 | INITIATE = 2; 168 | CHALLENGE = 3; 169 | RESPONSE = 4; 170 | WRAP = 5; 171 | } 172 | 173 | message SaslAuth { 174 | required string method = 1; 175 | required string mechanism = 2; 176 | optional string protocol = 3; 177 | optional string serverId = 4; 178 | optional bytes challenge = 5; 179 | } 180 | 181 | optional uint32 version = 1; 182 | required SaslState state = 2; 183 | optional bytes token = 3; 184 | repeated SaslAuth auths = 4; 185 | } 186 | --------------------------------------------------------------------------------