├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.md │ ├── config.yml │ ├── enhancement.md │ ├── good-first.md │ └── question.md ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml ├── labeler.yml ├── release.yml └── workflows │ ├── auto-label-pr.yaml │ ├── auto-release.yaml │ ├── call-e2e-upgrade.yaml │ ├── call-e2e.yaml │ ├── call-release-helm.yaml │ ├── call-release-image-hamicore.yaml │ ├── call-release-image.yaml │ ├── call-release-notes.yaml │ ├── call-release-website.yaml │ ├── ci-image-scanning.yaml │ ├── ci.yaml │ ├── codeql-analysis.yml │ ├── lint-chart.yaml │ └── test-self-hosted.yaml ├── .gitignore ├── .gitmodules ├── .golangci.yaml ├── .trivyignore ├── AUTHORS.md ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DEPENDENCY.md ├── HAMi.jpg ├── LICENSE ├── MAINTAINERS.md ├── Makefile ├── Makefile.defs ├── NOTICE.txt ├── OWNERS ├── README.md ├── README_cn.md ├── SECURITY.md ├── VERSION ├── benchmarks ├── README.md ├── ai-benchmark │ ├── Dockerfile │ └── build.sh └── deployments │ ├── job-on-hami.yml │ └── job-on-nvidia-device-plugin.yml ├── charts ├── Makefile └── hami │ ├── Chart.yaml │ ├── templates │ ├── NOTES.txt │ ├── _helpers.tpl │ ├── device-plugin │ │ ├── configmap.yaml │ │ ├── daemonsetnvidia.yaml │ │ ├── monitorrole.yaml │ │ ├── monitorrolebinding.yaml │ │ ├── monitorservice.yaml │ │ ├── monitorserviceaccount.yaml │ │ └── runtime-class.yaml │ └── scheduler │ │ ├── certmanager.yaml │ │ ├── configmap.yaml │ │ ├── configmapnew.yaml │ │ ├── deployment.yaml │ │ ├── device-configmap.yaml │ │ ├── job-patch │ │ ├── clusterrole.yaml │ │ ├── clusterrolebinding.yaml │ │ ├── job-createSecret.yaml │ │ ├── job-patchWebhook.yaml │ │ ├── psp.yaml │ │ ├── role.yaml │ │ ├── rolebinding.yaml │ │ └── serviceaccount.yaml │ │ ├── rolebinding.yaml │ │ ├── service.yaml │ │ ├── serviceaccount.yaml │ │ └── webhook.yaml │ └── values.yaml ├── cmd ├── device-plugin │ └── nvidia │ │ ├── main.go │ │ ├── plugin-manager.go │ │ ├── vgpucfg.go │ │ └── watchers.go ├── scheduler │ ├── main.go │ └── metrics.go └── vGPUmonitor │ ├── build.sh │ ├── feedback.go │ ├── main.go │ ├── metrics.go │ ├── noderpc │ ├── noderpc.pb.go │ ├── noderpc.proto │ └── noderpc_grpc.pb.go │ ├── testcollector │ └── main.go │ └── validation.go ├── docker ├── Dockerfile ├── Dockerfile.hamicore ├── Dockerfile.hamimaster ├── Dockerfile.withlib ├── entrypoint.sh └── vgpu-init.sh ├── docs ├── CHANGELOG │ └── CHANGELOG-0.0.0.md ├── ascend910b-support.md ├── ascend910b-support_cn.md ├── benchmark.md ├── benchmark_cn.md ├── cambricon-mlu-support.md ├── cambricon-mlu-support_cn.md ├── config.md ├── config_cn.md ├── dashboard.md ├── dashboard_cn.md ├── develop │ ├── design.md │ ├── dynamic-mig.md │ ├── imgs │ │ ├── flowchart.jpeg │ │ ├── gpu-scheduler-policy-demo.png │ │ ├── hami-dynamic-mig-procedure.png │ │ ├── hami-dynamic-mig-structure.png │ │ ├── node-shceduler-policy-demo.png │ │ ├── offline_validation.png │ │ ├── protocol_pod.png │ │ ├── protocol_register.png │ │ └── scheduler-policy-story.png │ ├── protocol.md │ ├── roadmap.md │ ├── scheduler-policy.md │ └── tasklist.md ├── dynamic-mig-support.md ├── dynamic-mig-support_cn.md ├── enflame-gcu-suport.md ├── enflame-gcu-support_cn.md ├── gpu-dashboard.json ├── how-to-profiling-scheduler.md ├── how-to-profiling-scheduler_cn.md ├── how-to-use-volcano-vgpu.md ├── hygon-dcu-support.md ├── hygon-dcu-support_cn.md ├── iluvatar-gpu-support.md ├── iluvatar-gpu-support_cn.md ├── metax-support.md ├── metax-support_cn.md ├── mind-map │ ├── HAMI-VGPU-mind-map-Chinese.png │ ├── 
HAMI-VGPU-mind-map-Chinese.xmind │ ├── HAMI-VGPU-mind-map-English.png │ ├── HAMI-VGPU-mind-map-English.xmind │ └── readme ├── mthreads-support.md ├── mthreads-support_cn.md ├── offline-install.md ├── proposals │ ├── e2e_test.md │ ├── e2e_test.png │ ├── gpu-topo-policy.md │ ├── gpu_utilization.png │ └── gpu_utilization_cn.md ├── release-process.md └── scheduler-event-log.md ├── example.yaml ├── examples ├── ascend │ ├── job-310P.yaml │ ├── job-910A.yaml │ ├── job-910B2.yaml │ ├── job-910B3.yaml │ └── job-910B4.yaml ├── enflame │ ├── default_use.yaml │ └── use_exclusive.yaml ├── hygon │ ├── default_use.yaml │ ├── specify_card_type_not_use.yaml │ └── specify_card_type_to_use.yaml ├── iluvatar │ ├── default_use.yaml │ ├── multi-containers.yaml │ └── multi-devices.yaml ├── metax │ ├── gpu │ │ ├── binpack.yaml │ │ ├── default_use.yaml │ │ └── spread.yaml │ └── sgpu │ │ ├── allocate_exclusive.yaml │ │ ├── allocate_specific_gpu.yaml │ │ ├── allocate_vmemory_MiB.yaml │ │ ├── default_use.yaml │ │ └── multi-containers.yaml ├── mlu │ ├── allocate_whole.yaml │ └── default_use.yaml ├── mthreads │ ├── default_use.yaml │ ├── multi_cards.yaml │ └── use_exclusive.yaml └── nvidia │ ├── default_use.yaml │ ├── default_use_legacy.yaml │ ├── dynamic_mig_example.yaml │ ├── example.yaml │ ├── mig_example.yaml │ ├── specify_card_type_not_use.yaml │ ├── specify_card_type_to_use.yaml │ ├── specify_scheduling_policy.yaml │ ├── specify_uuid_not_use.yaml │ ├── specify_uuid_to_use.yaml │ ├── use_as_normal.yaml │ ├── use_exclusive_card.yaml │ ├── use_memory_fraction.yaml │ └── use_sharing_card.yaml ├── go.mod ├── go.sum ├── hack ├── .import-aliases ├── boilerplate │ └── boilerplate.go.txt ├── build.sh ├── deploy-helm.sh ├── e2e-test-setup.sh ├── e2e-test.sh ├── kubeconfig-demo.yaml ├── tools │ ├── preferredimports │ │ └── preferredimports.go │ └── tools.go ├── unit-test.sh ├── update-generated-api.sh ├── util.sh ├── verify-all.sh ├── verify-chart-version.sh ├── verify-import-aliases.sh ├── verify-license.sh └── verify-staticcheck.sh ├── imgs ├── arch.png ├── benchmark.png ├── benchmark_inf.png ├── benchmark_train.png ├── cncf-logo.png ├── example.png ├── hami-arch.png ├── hami-arch.pptx ├── hami-graph-color.png ├── hami-horizontal-colordark.png ├── hami-vgpu-metrics-dashboard.png ├── hard_limit.jpg ├── metax_binpack.png ├── metax_spread.png ├── metax_topo.png └── release-process.png ├── lib └── nvidia │ └── ld.so.preload ├── pkg ├── device-plugin │ └── nvidiadevice │ │ └── nvinternal │ │ ├── cdi │ │ ├── api.go │ │ ├── api_mock.go │ │ ├── cdi.go │ │ ├── factory.go │ │ ├── null.go │ │ └── options.go │ │ ├── info │ │ └── version.go │ │ ├── mig │ │ └── mig.go │ │ ├── plugin │ │ ├── api.go │ │ ├── manager │ │ │ ├── api.go │ │ │ ├── factory.go │ │ │ ├── null.go │ │ │ ├── nvml.go │ │ │ ├── options.go │ │ │ └── tegra.go │ │ ├── register.go │ │ ├── register_test.go │ │ ├── server.go │ │ ├── server_test.go │ │ ├── util.go │ │ └── util_test.go │ │ └── rm │ │ ├── allocate.go │ │ ├── device_map.go │ │ ├── device_map_test.go │ │ ├── devices.go │ │ ├── health.go │ │ ├── health_test.go │ │ ├── helper.go │ │ ├── nvml_devices.go │ │ ├── nvml_devices_test.go │ │ ├── nvml_manager.go │ │ ├── rm.go │ │ ├── tegra_devices.go │ │ ├── tegra_manager.go │ │ └── wsl_devices.go ├── device │ ├── ascend │ │ ├── device.go │ │ ├── device_test.go │ │ └── vnpu.go │ ├── cambricon │ │ ├── device.go │ │ └── device_test.go │ ├── devices.go │ ├── devices_test.go │ ├── enflame │ │ ├── device.go │ │ └── device_test.go │ ├── hygon │ │ ├── device.go │ │ └── 
device_test.go │ ├── iluvatar │ │ ├── device.go │ │ └── device_test.go │ ├── metax │ │ ├── config.go │ │ ├── device.go │ │ ├── device_test.go │ │ ├── protocol.go │ │ ├── protocol_test.go │ │ ├── sdevice.go │ │ └── sdevice_test.go │ ├── mthreads │ │ ├── device.go │ │ └── device_test.go │ └── nvidia │ │ ├── calculate_score.go │ │ ├── calculate_score_test.go │ │ ├── device.go │ │ ├── device_test.go │ │ └── links.go ├── k8sutil │ ├── pod.go │ └── pod_test.go ├── monitor │ └── nvidia │ │ ├── cudevshr.go │ │ ├── v0 │ │ ├── spec.go │ │ └── spec_test.go │ │ └── v1 │ │ ├── spec.go │ │ └── spec_test.go ├── oci │ ├── runtime.go │ ├── runtime_exec.go │ ├── runtime_exec_test.go │ ├── runtime_mock.go │ ├── spec.go │ └── spec_mock.go ├── scheduler │ ├── config │ │ └── config.go │ ├── event.go │ ├── event_test.go │ ├── nodes.go │ ├── nodes_test.go │ ├── pod_test.go │ ├── pods.go │ ├── policy │ │ ├── constant.go │ │ ├── gpu_policy.go │ │ ├── gpu_policy_test.go │ │ ├── node_policy.go │ │ └── node_policy_test.go │ ├── routes │ │ └── route.go │ ├── scheduler.go │ ├── scheduler_test.go │ ├── score.go │ ├── score_test.go │ ├── webhook.go │ └── webhook_test.go ├── util │ ├── client │ │ ├── client.go │ │ ├── client_test.go │ │ ├── options.go │ │ └── testdata │ │ │ ├── invalid_kubeconfig.yaml │ │ │ └── kubeconfig.yaml │ ├── flag │ │ ├── flags.go │ │ └── flags_test.go │ ├── nodelock │ │ ├── nodelock.go │ │ └── nodelock_test.go │ ├── types.go │ ├── util.go │ └── util_test.go └── version │ ├── version.go │ └── version_test.go ├── test ├── e2e │ ├── node │ │ ├── test_node.go │ │ └── test_suite_test.go │ ├── pod │ │ ├── test_pod.go │ │ └── test_suite_test.go │ └── test_suite_test.go └── utils │ ├── common.go │ ├── config.go │ ├── event.go │ ├── node.go │ └── pod.go └── version.mk /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Report a bug encountered while using HAMi. 4 | labels: kind/bug 5 | 6 | --- 7 | 8 | 10 | 11 | **What happened**: 12 | 13 | **What you expected to happen**: 14 | 15 | **How to reproduce it (as minimally and precisely as possible)**: 16 | 17 | **Anything else we need to know?**: 18 | 19 | - The output of `nvidia-smi -a` on your host 20 | - Your docker or containerd configuration file (e.g: `/etc/docker/daemon.json`) 21 | - The hami-device-plugin container logs 22 | - The hami-scheduler container logs 23 | - The kubelet logs on the node (e.g: `sudo journalctl -r -u kubelet`) 24 | - Any relevant kernel output lines from `dmesg` 25 | 26 | **Environment**: 27 | - HAMi version: 28 | - nvidia driver or other AI device driver version: 29 | - Docker version from `docker version` 30 | - Docker command, image and tag used 31 | - Kernel version from `uname -a` 32 | - Others: 33 | 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | contact_links: 2 | - name: FAQ 3 | url: https://github.com/Project-HAMi/HAMi/issues/646 4 | about: Frequently asked questions and common solutions. 
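A note on the file above: `contact_links` is only part of what GitHub accepts in `.github/ISSUE_TEMPLATE/config.yml`. A minimal sketch (illustrative only, not this repository's actual file) that additionally disables blank issues, so authors must pick one of the templates that follow:

```yaml
# Illustrative sketch of .github/ISSUE_TEMPLATE/config.yml — not the repo's actual file
blank_issues_enabled: false   # require issue authors to choose one of the templates
contact_links:
  - name: FAQ
    url: https://github.com/Project-HAMi/HAMi/issues/646
    about: Frequently asked questions and common solutions.
```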
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Enhancement Request 3 | about: Suggest an enhancement to the project 4 | labels: kind/feature 5 | 6 | --- 7 | 8 | 9 | **What would you like to be added**: 10 | 11 | **What type of PR is this?** 12 | 13 | /kind feature 14 | 15 | **What this PR does / why we need it**: 16 | 17 | **Which issue(s) this PR fixes**: 18 | Fixes # 19 | 20 | **Special notes for your reviewer**: 21 | 22 | **Does this PR introduce a user-facing change?**: -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/good-first.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Good First Issue 3 | about: Publish a good first issue 4 | labels: good first issue 5 | 6 | --- 7 | 8 | 10 | 11 | **Task description**: 12 | 13 | **Solution**: 14 | 15 | **Who can join or take the task**: 16 | 17 | The good first issue is intended for `first-time contributors` to get started on their contributor journey. 18 | 19 | After a contributor has successfully completed 1-2 good first issues, 20 | they should be ready to move on to `help wanted` items, saving the remaining `good first issue` for other new contributors. 21 | 22 | **How to join or take the task**: 23 | 24 | Just reply on the issue with the message `/assign` on a separate line. 25 | 26 | Then, the issue will be assigned to you. 27 | 28 | **How to ask for help**: 29 | 30 | If you need help or have questions, please feel free to ask on this issue. 31 | The issue author or other members of the community will guide you through the contribution process. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Question relating to HAMi. 4 | labels: kind/question 5 | 6 | --- 7 | 8 | **Please provide an in-depth description of the question you have**: 9 | 10 | **What do you think about this question?**: 11 | 12 | **Environment**: 13 | - HAMi version: 14 | - Kubernetes version: 15 | - Others: -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | **What type of PR is this?** 2 | 3 | 14 | 15 | **What this PR does / why we need it**: 16 | 17 | **Which issue(s) this PR fixes**: 18 | Fixes # 19 | 20 | **Special notes for your reviewer**: 21 | 22 | **Does this PR introduce a user-facing change?**: -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | 2 | # To get started with Dependabot version updates, you'll need to specify which 3 | # package ecosystems to update and where the package manifests are located. 
4 | # Please see the documentation for all configuration options: 5 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 6 | 7 | 8 | version: 2 9 | updates: 10 | - package-ecosystem: "gomod" 11 | directory: "/" 12 | schedule: 13 | interval: "daily" 14 | - package-ecosystem: "docker" 15 | directory: "/docker" 16 | schedule: 17 | interval: "daily" 18 | - package-ecosystem: "github-actions" 19 | directory: "/" 20 | schedule: 21 | interval: "daily" 22 | -------------------------------------------------------------------------------- /.github/labeler.yml: -------------------------------------------------------------------------------- 1 | "kind/bug": 2 | - '^[Ff]ix(\(.*\))?:?.*' 3 | "kind/cleanup": 4 | - '^[Cc]hore(\(.*\))?:?.*' 5 | "kind/documentation": 6 | - '^[Dd]ocs?(\(.*\))?:?.*' 7 | "kind/enhancement": 8 | - '^[Rr]efactor(\(.*\))?:?.*' 9 | "kind/feature": 10 | - '^[Ff]eat(\(.*\))?:?.*' 11 | -------------------------------------------------------------------------------- /.github/release.yml: -------------------------------------------------------------------------------- 1 | # .github/release.yml 2 | changelog: 3 | exclude: 4 | labels: 5 | - ignore-for-release 6 | - github-actions 7 | authors: 8 | - dependabot[bot] 9 | categories: 10 | - title: ✨ New Features 11 | labels: 12 | - feature 13 | - design 14 | - enhancement 15 | - title: 🐛 Bug Fixes 16 | labels: 17 | - bug 18 | - title: 📚 Documentation 19 | labels: 20 | - documentation 21 | - title: ⬆️ Dependencies 22 | labels: 23 | - dependencies 24 | - title: 💥 Breaking Changes 25 | labels: 26 | - breaking-change 27 | - title: 🔨 Other Changes 28 | labels: 29 | - "*" 30 | -------------------------------------------------------------------------------- /.github/workflows/auto-label-pr.yaml: -------------------------------------------------------------------------------- 1 | name: "PR Labeler" 2 | on: 3 | pull_request_target: 4 | types: [opened, edited] 5 | 6 | permissions: 7 | issues: write 8 | pull-requests: write 9 | contents: read 10 | 11 | jobs: 12 | labeling: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: github/issue-labeler@v3.4 16 | with: 17 | configuration-path: .github/labeler.yml 18 | enable-versioned-regex: 0 19 | sync-labels: 1 20 | include-title: 1 21 | include-body: 0 22 | repo-token: ${{ github.token }} 23 | -------------------------------------------------------------------------------- /.github/workflows/call-e2e-upgrade.yaml: -------------------------------------------------------------------------------- 1 | name: Call e2e upgrade test 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | ref: 7 | required: true 8 | type: string 9 | permissions: write-all 10 | 11 | jobs: 12 | upgrade-e2e: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: e2e upgrade test 16 | # https://github.com/actions/virtual-environments/issues/709 17 | run: | 18 | echo "Need to add e2e upgrade test" 19 | -------------------------------------------------------------------------------- /.github/workflows/call-e2e.yaml: -------------------------------------------------------------------------------- 1 | name: Call e2e test 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | ref: 7 | description: 'Reference id to run tests' 8 | required: true 9 | type: string 10 | type: 11 | description: 'E2E type' 12 | required: true 13 | type: string 14 | default: pullrequest 15 | 16 | jobs: 17 | e2e-test: 18 | strategy: 19 | matrix: 20 | include: 21 | - device: nvidia 22 | type: tesla-p4 23 | # - device: nvidia 24 | # 
type: rtx-4090 25 | # - device: huawei 26 | # type: ascend-910b 27 | runs-on: [ "${{ matrix.device }}", "${{ matrix.type }}" ] 28 | environment: ${{ matrix.device }} 29 | env: 30 | E2E_TYPE: ${{ inputs.type }} 31 | HAMI_VERSION: ${{ inputs.ref }} 32 | steps: 33 | - name: checkout code 34 | uses: actions/checkout@v4 35 | 36 | - name: install Go 37 | uses: actions/setup-go@v5 38 | with: 39 | go-version: "1.21" 40 | 41 | - name: setup e2e env 42 | run: | 43 | make e2e-env-setup 44 | 45 | - name: download hami helm 46 | if: inputs.type == 'pullrequest' 47 | uses: actions/download-artifact@v4 48 | with: 49 | name: chart_package_artifact 50 | path: charts/ 51 | 52 | - name: download hami image 53 | if: inputs.type == 'pullrequest' 54 | uses: actions/download-artifact@v4 55 | with: 56 | name: hami-image 57 | path: ./image 58 | 59 | - name: load e2e image 60 | if: inputs.type == 'pullrequest' 61 | run: | 62 | echo "Loading Docker image from image.tar..." 63 | if [ -z "${VSPHERE_GPU_VM_IP}" ]; then 64 | echo "Error: VSPHERE_GPU_VM_IP is not defined!" 65 | exit 1 66 | fi 67 | scp ./image/image.tar root@$VSPHERE_GPU_VM_IP:/home/ 68 | ssh root@$VSPHERE_GPU_VM_IP "nerdctl load -i /home/image.tar" 69 | ssh root@$VSPHERE_GPU_VM_IP "nerdctl image ls | grep hami" 70 | 71 | - name: deploy hami helm 72 | run: | 73 | make helm-deploy 74 | 75 | - name: e2e test 76 | run: | 77 | make e2e-test 78 | -------------------------------------------------------------------------------- /.github/workflows/call-release-website.yaml: -------------------------------------------------------------------------------- 1 | name: Call Release website 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | ref: 7 | required: true 8 | type: string 9 | permissions: write-all 10 | 11 | jobs: 12 | build-website: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: release hami website 16 | # https://github.com/actions/virtual-environments/issues/709 17 | run: | 18 | echo "Need to publish hami website" 19 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | name: "CodeQL" 7 | 8 | on: 9 | workflow_dispatch: 10 | push: 11 | branches: ["master","dev"] 12 | paths-ignore: 13 | - "**/*.json" 14 | - "**/*.md" 15 | - "**/*.txt" 16 | - "**/*.yml" 17 | schedule: 18 | - cron: "0 4 * * 6" 19 | 20 | permissions: 21 | security-events: write 22 | # required to fetch internal or private CodeQL packs 23 | packages: read 24 | 25 | # only required for workflows in private repositories 26 | actions: read 27 | contents: read 28 | 29 | jobs: 30 | analyze: 31 | name: Analyze 32 | runs-on: ubuntu-latest 33 | if: github.repository == 'Project-HAMi/HAMi' 34 | 35 | strategy: 36 | fail-fast: false 37 | matrix: 38 | language: ["go"] 39 | 40 | steps: 41 | - name: Checkout repository 42 | uses: actions/checkout@v4 43 | - name: Checkout submodule 44 | uses: Mushus/checkout-submodule@v1.0.1 45 | with: 46 | basePath: # optional, default is . 47 | submodulePath: libvgpu 48 | - if: matrix.language == 'go' 49 | name: Set go version 50 | uses: actions/setup-go@v5 51 | with: 52 | go-version-file: go.mod 53 | 54 | # Initializes the CodeQL tools for scanning. 
55 | - name: Initialize CodeQL 56 | uses: github/codeql-action/init@v3 57 | with: 58 | languages: ${{ matrix.language }} 59 | # If you wish to specify custom queries, you can do so here or in a config file. 60 | # By default, queries listed here will override any specified in a config file. 61 | # Prefix the list here with "+" to use these queries and those in the config file. 62 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 63 | 64 | - name: Perform CodeQL Analysis 65 | uses: github/codeql-action/analyze@v3 66 | -------------------------------------------------------------------------------- /.github/workflows/lint-chart.yaml: -------------------------------------------------------------------------------- 1 | name: Chart Lint 2 | 3 | on: 4 | push: 5 | # Exclude branches created by Dependabot to avoid triggering current workflow 6 | # for PRs initiated by Dependabot. 7 | branches-ignore: 8 | - 'dependabot/**' 9 | pull_request: 10 | paths: 11 | - "charts/**" 12 | 13 | jobs: 14 | chart-lint-test: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout 18 | uses: actions/checkout@v4 19 | with: 20 | fetch-depth: 0 21 | 22 | - name: Set up Helm 23 | uses: azure/setup-helm@v4 24 | with: 25 | version: v3.7.1 26 | - name: Lint Chart 27 | run: | 28 | make lint_chart 29 | - name: Check chart version 30 | run: bash ./hack/verify-chart-version.sh 31 | 32 | -------------------------------------------------------------------------------- /.github/workflows/test-self-hosted.yaml: -------------------------------------------------------------------------------- 1 | name: Test self-hosted-runner 2 | 3 | on: 4 | push: 5 | # Exclude branches created by Dependabot to avoid triggering current workflow 6 | # for PRs initiated by Dependabot. 7 | branches-ignore: 8 | - 'dependabot/**' 9 | pull_request: 10 | paths: 11 | - "charts/**" 12 | 13 | jobs: 14 | e2e: 15 | runs-on: self-hosted 16 | steps: 17 | - name: e2e test 18 | # https://github.com/actions/virtual-environments/issues/709 19 | run: | 20 | echo "Need to add e2e test" 21 | 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | run_device_plugin.sh 3 | run_scheduler.sh 4 | device_plugin.sh 5 | libvgpu/build 6 | updateso.sh 7 | libvgpu.so 8 | .idea 9 | vendor 10 | license 11 | vgpuvalidator 12 | _output/ 13 | coverage.out 14 | .DS_Store 15 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "libvgpu"] 2 | path = libvgpu 3 | url = https://github.com/Project-HAMi/HAMi-core.git 4 | branch = main 5 | -------------------------------------------------------------------------------- /.golangci.yaml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | run: 3 | concurrency: 4 4 | modules-download-mode: readonly 5 | output: 6 | formats: 7 | text: 8 | path: stdout 9 | print-linter-name: true 10 | print-issued-lines: true 11 | colors: true 12 | linters: 13 | default: none 14 | enable: 15 | - asciicheck 16 | - forcetypeassert 17 | - godot 18 | - misspell 19 | - staticcheck 20 | settings: 21 | dupl: 22 | threshold: 800 23 | errcheck: 24 | check-type-assertions: true 25 | check-blank: true 26 | errorlint: 27 | errorf: true 28 | asserts: true 29 | comparison: true 30 | goconst: 31 | min-len: 3 32 | min-occurrences: 3 33 | 
gocritic: 34 | disabled-checks: 35 | - commentedOutCode 36 | - whyNoLint 37 | enabled-tags: 38 | - diagnostic 39 | - experimental 40 | - opinionated 41 | - performance 42 | - style 43 | settings: 44 | hugeParam: 45 | sizeThreshold: 80 46 | rangeExprCopy: 47 | sizeThreshold: 512 48 | rangeValCopy: 49 | sizeThreshold: 128 50 | gocyclo: 51 | min-complexity: 20 52 | godot: 53 | scope: declarations 54 | capital: false 55 | nestif: 56 | min-complexity: 20 57 | exclusions: 58 | generated: lax 59 | presets: 60 | - comments 61 | - common-false-positives 62 | - legacy 63 | - std-error-handling 64 | paths: 65 | - third_party$ 66 | - builtin$ 67 | - examples$ 68 | - pkg/device-plugin 69 | issues: 70 | uniq-by-line: true 71 | formatters: 72 | enable: 73 | - gofmt 74 | - goimports 75 | settings: 76 | gofmt: 77 | simplify: true 78 | gofumpt: 79 | extra-rules: true 80 | goimports: 81 | local-prefixes: 82 | - github.com/Project-HAMi/HAMi 83 | exclusions: 84 | generated: lax 85 | paths: 86 | - third_party$ 87 | - builtin$ 88 | - examples$ 89 | -------------------------------------------------------------------------------- /.trivyignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/.trivyignore -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | - Please check [HAMi Community Membership](https://github.com/Project-HAMi/community/blob/main/community-membership.md) to find how to be a contributor. 4 | - Here is the full list of the [MAINTAINERS](./MAINTAINERS.md). 5 | 6 | The following people, in alphabetical order, have either authored or signed off on commits in the HAMi repository: 7 | 8 | 9 | | Contributor | Email | 10 | |-----------------|-----------| 11 | | [archlitchi](https://github.com/archlitchi) | archlitchi@gmail.com| 12 | | [atttx123](https://github.com/atttx123) | - | 13 | | [chaunceyjiang](https://github.com/chaunceyjiang) | chaunceyjiang@gmail.com| 14 | | [CoderTH](https://github.com/CoderTH) | - | 15 | | [gsakun](https://github.com/gsakun) | - | 16 | | [lengrongfu](https://github.com/lengrongfu) | - | 17 | | [ouyangluwei](https://github.com/ouyangluwei163) | ouyangluwei@riseunion.io | 18 | | peizhaoyou | peizhaoyou@4paradigm.com | 19 | | [wawa0210](https://github.com/wawa0210) | xiaozhang0210@hotmail.com | 20 | | [whybeyoung](https://github.com/whybeyoung) | - | 21 | | [yinyu](https://github.com/Nimbus318) | nimbus-nimo@proton.me | 22 | | [yangshiqi](https://github.com/yangshiqi) | yangshiqi@riseunion.io | 23 | | zhengbingxian | - | 24 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # HAMi Community Code of Conduct 2 | 3 | Please refer to our [HAMi Community Code of Conduct](https://github.com/Project-HAMi/community/blob/main/CODE-OF-CONDUCT.md). 4 | -------------------------------------------------------------------------------- /DEPENDENCY.md: -------------------------------------------------------------------------------- 1 | # Environment Dependencies Policy 2 | 3 | ## Purpose 4 | 5 | This policy establishes guidelines for managing third-party packages in the HAMi repository. 
Its goal is to ensure that all dependencies are secure, up-to-date, and necessary for the project’s functionality. 6 | 7 | ## Scope 8 | 9 | This policy applies to all maintainers of the HAMi repository and governs all third-party packages incorporated into the project. 10 | 11 | ## Policy 12 | 13 | Maintainers must adhere to the following when incorporating third-party packages: 14 | 15 | - **Necessity:** Include only those packages that are essential to the project’s functionality. 16 | - **Latest Stable Versions:** Use the latest stable releases whenever possible. 17 | - **Security:** Avoid packages with known security vulnerabilities. 18 | - **Version Pinning:** Lock all dependencies to specific versions to maintain consistency. 19 | - **Dependency Management:** Utilize an appropriate dependency management tool (e.g., Go modules, npm, pip) to handle third-party packages. 20 | - **Testing:** Ensure that any new dependency passes all automated tests before integration. 21 | 22 | ## Procedure 23 | 24 | When adding a new third-party package, maintainers should: 25 | 26 | 1. **Assess Need:** Determine whether the package is truly necessary for the project. 27 | 2. **Conduct Research:** Review the package’s maintenance status and reputation within the community. 28 | 3. **Select Version:** Opt for the latest stable version that meets the project’s requirements. 29 | 4. **Pin the Version:** Explicitly pin the dependency to the chosen version within the repository. 30 | 5. **Update Documentation:** Revise the project documentation to include details about the new dependency. 31 | 32 | ## Archive/Deprecation 33 | 34 | If a third-party package becomes deprecated or discontinued, maintainers must promptly identify and integrate a suitable alternative while updating the documentation accordingly. 35 | 36 | ## Enforcement 37 | 38 | Compliance with this policy is monitored by the HAMi maintainers. All dependency-related changes are subject to peer review to ensure adherence to these guidelines. 39 | 40 | ## Exceptions 41 | 42 | Exceptions to this policy may be granted by the HAMi project lead on a case-by-case basis. Any exceptions must be documented with a clear rationale. 43 | 44 | ## Credits 45 | 46 | This policy has been adapted and optimized based on guidelines from the [Kubescape Community](https://github.com/kubescape/kubescape/blob/master/docs/environment-dependencies-policy.md). -------------------------------------------------------------------------------- /HAMi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/HAMi.jpg -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | # Maintainers 2 | 3 | - Please check [HAMi Community Membership](https://github.com/Project-HAMi/community/blob/main/community-membership.md) to find how to level up through the project. 4 | - Please see [Contributors](./AUTHORS.md) for the full list of contributors to the project. 
5 | 6 | ## HAMi Committers 7 | 8 | | Maintainer | Email | Employer | 9 | |---------------------------------------------------|-----------|-----------| 10 | | [Li Mengxuan](https://github.com/archlitchi) | archlitchi@gmail.com | [dynamia.ai](https://www.dynamia.ai/) | 11 | | [Xiao Zhang](https://github.com/wawa0210) | xiaozhang0210@hotmail.com | [dynamia.ai](https://www.dynamia.ai/) | 12 | | [Wang Leibo](https://github.com/william-wang) | wang.platform@gmail.com | [HuaweiCloud](https://www.huaweicloud.com/) | 13 | | [Yin Yu](https://github.com/Nimbus318) | nimbus-nimo@proton.me | Independent Developer | 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ##### Global variables ##### 2 | include version.mk Makefile.defs 3 | 4 | all: build 5 | 6 | docker: 7 | docker build \ 8 | --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} \ 9 | --build-arg TARGET_ARCH=${TARGET_ARCH} \ 10 | --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} \ 11 | --build-arg DEST_DIR=${DEST_DIR} \ 12 | --build-arg VERSION=${VERSION} \ 13 | --build-arg GOPROXY=https://goproxy.cn,direct \ 14 | . -f=docker/Dockerfile -t ${IMG_TAG} 15 | 16 | dockerwithlib: 17 | docker build \ 18 | --no-cache \ 19 | --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} \ 20 | --build-arg TARGET_ARCH=${TARGET_ARCH} \ 21 | --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} \ 22 | --build-arg DEST_DIR=${DEST_DIR} \ 23 | --build-arg VERSION=${VERSION} \ 24 | --build-arg GOPROXY=https://goproxy.cn,direct \ 25 | . -f=docker/Dockerfile.withlib -t ${IMG_TAG} 26 | 27 | tidy: 28 | $(GO) mod tidy 29 | 30 | proto: 31 | $(GO) get github.com/gogo/protobuf/protoc-gen-gofast@v1.3.2 32 | protoc --gofast_out=plugins=grpc:. ./pkg/api/*.proto 33 | 34 | build: $(CMDS) $(DEVICES) 35 | 36 | $(CMDS): 37 | $(GO) build -ldflags '-s -w -X github.com/Project-HAMi/HAMi/pkg/version.version=$(VERSION)' -o ${OUTPUT_DIR}/$@ ./cmd/$@ 38 | 39 | $(DEVICES): 40 | $(GO) build -ldflags '-s -w -X github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/info.version=$(VERSION)' -o ${OUTPUT_DIR}/$@-device-plugin ./cmd/device-plugin/$@ 41 | 42 | clean: 43 | $(GO) clean -r -x ./cmd/... 
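# The "-" prefix on the next command tells make to ignore that command's exit status, so "clean" still succeeds if $(OUTPUT_DIR) is already gone.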
44 | -rm -rf $(OUTPUT_DIR) 45 | 46 | .PHONY: all build docker clean test $(CMDS) 47 | 48 | test: 49 | mkdir -p ./_output/coverage/ 50 | bash hack/unit-test.sh 51 | 52 | lint: 53 | bash hack/verify-staticcheck.sh 54 | 55 | .PHONY: verify 56 | verify: 57 | hack/verify-all.sh 58 | 59 | .PHONY: lint_dockerfile 60 | lint_dockerfile: 61 | @ docker run --rm \ 62 | -v $(ROOT_DIR)/.trivyignore:/.trivyignore \ 63 | -v /tmp/trivy:/root/trivy.cache/ \ 64 | -v $(ROOT_DIR):/tmp/src \ 65 | aquasec/trivy:$(TRIVY_VERSION) config --exit-code 1 --severity $(LINT_TRIVY_SEVERITY_LEVEL) /tmp/src/docker ; \ 66 | (($$?==0)) || { echo "error, failed to check dockerfile trivy" && exit 1 ; } ; \ 67 | echo "dockerfile trivy check: pass" 68 | 69 | .PHONY: lint_chart 70 | lint_chart: 71 | @ docker run --rm \ 72 | -v $(ROOT_DIR)/.trivyignore:/.trivyignore \ 73 | -v /tmp/trivy:/root/trivy.cache/ \ 74 | -v $(ROOT_DIR):/tmp/src \ 75 | aquasec/trivy:$(TRIVY_VERSION) config --exit-code 1 --severity $(LINT_TRIVY_SEVERITY_LEVEL) /tmp/src/charts ; \ 76 | (($$?==0)) || { echo "error, failed to check chart trivy" && exit 1 ; } ; \ 77 | echo "chart trivy check: pass" 78 | 79 | .PHONY: e2e-env-setup 80 | e2e-env-setup: 81 | ./hack/e2e-test-setup.sh 82 | 83 | .PHONY: helm-deploy 84 | helm-deploy: 85 | ./hack/deploy-helm.sh "${E2E_TYPE}" "${KUBE_CONF}" "${HAMI_VERSION}" 86 | 87 | .PHONY: e2e-test 88 | e2e-test: 89 | ./hack/e2e-test.sh "${E2E_TYPE}" "${KUBE_CONF}" 90 | -------------------------------------------------------------------------------- /Makefile.defs: -------------------------------------------------------------------------------- 1 | 2 | SHELL := /bin/bash 3 | .SHELLFLAGS := -eu -o pipefail -c 4 | 5 | ROOT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 6 | 7 | INSTALL = install 8 | 9 | PREFIX?=/usr 10 | BINDIR?=$(PREFIX)/bin 11 | TARGETARCH ?= amd64 12 | 13 | DESTDIR_BIN ?= $(ROOT_DIR)/output/$(TARGETARCH)/bin 14 | DESTDIR_BASH_COMPLETION ?= $(ROOT_DIR)/output/$(TARGETARCH)/bash-completion 15 | 16 | VERSION?="" 17 | ifeq ($(VERSION), "") 18 | VERSION=$(shell cat $(dir $(lastword $(MAKEFILE_LIST)))/VERSION) 19 | endif 20 | 21 | ECHO_GEN=echo " GEN $(RELATIVE_DIR)/" 22 | 23 | LINT_TRIVY_SEVERITY_LEVEL ?= CRITICAL 24 | TRIVY_VERSION=0.36.0 25 | 26 | .PHONY: print-version 27 | print-version: 28 | @echo $(VERSION) 29 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | HAMi(https://project-hami.io/) 2 | Copyright HAMi Contributors 3 | 4 | This product includes software developed by 5 | NVIDIA CORPORATION (https://www.nvidia.com). 6 | Copyright (c) NVIDIA CORPORATION. All rights reserved. 7 | 8 | This product includes software developed by 9 | The HAMi Authors. 10 | Copyright 2024 The HAMi Authors. 11 | 12 | Both are licensed under the Apache License, Version 2.0. 
13 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | reviewers: 2 | - archlitchi 3 | - wawa0210 4 | - chaunceyjiang 5 | - lengrongfu 6 | approvers: 7 | - archlitchi 8 | - wawa0210 9 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | The following table outlines which versions of HAMi receive security updates: 6 | 7 | | Version | Supported | 8 | |---------|--------------------| 9 | | 2.5.x | ✅ Security fixes | 10 | | 2.4.x | ✅ Security fixes | 11 | | before 2.4.0 | ❌ No longer supported | 12 | 13 | ## Reporting a Vulnerability 14 | 15 | If you discover a security vulnerability in HAMi, we strongly encourage you to report it responsibly. Please **do not** disclose security vulnerabilities publicly without following our responsible disclosure process. 16 | 17 | ### How to Report 18 | - **GitHub Security Advisories**: [submit a private vulnerability report via GitHub](https://github.com/Project-HAMi/HAMi/security/advisories/new). 19 | - **Bug Bounty**: Currently, HAMi does not offer a public bug bounty program. 20 | 21 | ### Information to Include 22 | When reporting a security issue, please include: 23 | - A clear and concise description of the vulnerability. 24 | - Steps to reproduce the issue. 25 | - Any potential attack scenarios or security impact. 26 | - Suggested mitigations or fixes, if available. 27 | 28 | ## Response Process 29 | 30 | We follow a structured process to handle security reports. 31 | 32 | Response times could be affected by weekends, holidays, breaks or time zone differences. That said, the maintainers will endeavour to reply as soon as possible, ideally within 5 working days. 33 | 34 | 35 | ## Third-Party Dependencies 36 | 37 | HAMi relies on third-party libraries and containers. We monitor dependencies and promptly apply security patches. 38 | 39 | 40 | Thank you for helping us make HAMi more secure! 🔒 -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | v2.5.0 2 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking the vGPU scheduler 2 | 3 | ## Prerequisites 4 | 5 | ### How to build the benchmark image 6 | 7 | ```bash 8 | cd HAMi/benchmarks/ai-benchmark 9 | 10 | sh build.sh 11 | ``` 12 | 13 | ## How to install the official nvidia device plugin 14 | 15 | Please refer to [Quick Start](https://github.com/NVIDIA/k8s-device-plugin?tab=readme-ov-file#quick-start) in the official nvidia device plugin repository. 16 | 17 | ## Run the benchmark 18 | 19 | ```bash 20 | cd HAMi/benchmarks/deployments 21 | 22 | kubectl apply -f job-on-hami.yml 23 | 24 | kubectl apply -f job-on-nvidia-device-plugin.yml 25 | ``` -------------------------------------------------------------------------------- /benchmarks/ai-benchmark/Dockerfile: -------------------------------------------------------------------------------- 1 | # This Dockerfile is used to build a Docker image for running the AI Benchmark. 2 | # It is based on the tensorflow/tensorflow:latest-gpu image. 
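# A typical build invocation, mirroring the defaults in build.sh alongside this file
# (illustrative comment, not part of the original Dockerfile):
#   docker buildx build --platform linux/amd64 -t vgpu-benchmark:v0.0.1 -f Dockerfile .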
3 | 4 | FROM tensorflow/tensorflow:latest-gpu 5 | 6 | # Set the working directory to /ai-benchmark 7 | WORKDIR ai-benchmark 8 | 9 | # Update the package list and install git and apt-utils 10 | RUN apt-get update && \ 11 | apt-get install -y --no-install-recommends apt-utils git && \ 12 | rm -rf /var/lib/apt/lists/* && \ 13 | pip install --no-cache-dir --upgrade pip && \ 14 | git clone https://github.com/Project-HAMi/ai-benchmark . && \ 15 | pip install --no-cache-dir -r requirements.txt 16 | 17 | # Set the default command to run when the container starts 18 | CMD ["python", "./main.py"] -------------------------------------------------------------------------------- /benchmarks/ai-benchmark/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | IMAGE="vgpu-benchmark" 5 | TAG="v0.0.1" 6 | PLATFORM="linux/amd64" 7 | 8 | docker buildx build --push \ 9 | --platform $PLATFORM \ 10 | --no-cache \ 11 | -t "$IMAGE:$TAG" \ 12 | -f Dockerfile . -------------------------------------------------------------------------------- /benchmarks/deployments/job-on-hami.yml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: ai-benchmark-on-hami 5 | spec: 6 | template: 7 | metadata: 8 | name: ai-benchmark-on-hami 9 | spec: 10 | containers: 11 | - name: ai-benchmark-on-hami 12 | image: 4pdosc/ai-benchmark:2.4.1-gpu 13 | resources: 14 | requests: 15 | nvidia.com/gpu: 1 16 | nvidia.com/gpumem-percentage: 50 17 | limits: 18 | nvidia.com/gpu: 1 19 | nvidia.com/gpumem-percentage: 50 20 | restartPolicy: Never -------------------------------------------------------------------------------- /benchmarks/deployments/job-on-nvidia-device-plugin.yml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: ai-benchmark-on-official 5 | spec: 6 | template: 7 | metadata: 8 | name: ai-benchmark-on-official 9 | spec: 10 | containers: 11 | - name: ai-benchmark-on-official 12 | image: 4pdosc/ai-benchmark:2.4.1-gpu 13 | resources: 14 | requests: 15 | nvidia.com/gpu: 1 16 | limits: 17 | nvidia.com/gpu: 1 18 | restartPolicy: Never -------------------------------------------------------------------------------- /charts/Makefile: -------------------------------------------------------------------------------- 1 | # get VERSION 2 | .DEFAULT_GOAL := all 3 | include ../Makefile.defs 4 | 5 | VERSION_REGEX := '[vV]*[0-9]\+\.[0-9]\+\.[0-9]\+.*' 6 | CHART_FILE := "./hami/Chart.yaml" 7 | VALUES_FILE := "./hami/values.yaml" 8 | 9 | .PHONY: all lint update-versions 10 | all: update-versions lint package 11 | 12 | #update version in chart 13 | update-versions: 14 | $(ECHO_GEN) " Updating Chart version to $(VERSION)" 15 | echo "VERSION=$(VERSION)" 16 | echo "VERSION_MAJOR=$(VERSION_MAJOR)" 17 | echo "GIT_VERSION=$(GIT_VERSION)" 18 | echo "FULL_BUILD_VERSION=$(FULL_BUILD_VERSION)" 19 | @# Update chart versions to point to the current version. 
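@# chart_version below drops the "v" prefix (e.g. v2.5.0 -> 2.5.0) for Chart.yaml's version/appVersion, while values.yaml keeps the "v"-prefixed tag via hami_version.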
20 | hami_version="$(VERSION)"; \ 21 | chart_version=` echo $(VERSION) | tr -d 'v' ` ; \ 22 | sed -i 's/version: "*'$(VERSION_REGEX)'"*/version: '$$chart_version'/g' $(CHART_FILE); \ 23 | sed -i 's/appVersion: "*'$(VERSION_REGEX)'"*/appVersion: "'$$chart_version'"/g' $(CHART_FILE); \ 24 | sed -i 's/version: "*'$(VERSION_REGEX)'"*/version: "'$$hami_version'"/g' $(VALUES_FILE) 25 | 26 | lint: update-versions 27 | helm lint --with-subcharts --values ./hami/values.yaml ./hami --debug 28 | 29 | package: lint 30 | helm package ./hami --debug 31 | 32 | clean: 33 | rm -f *.tgz 34 | 35 | -------------------------------------------------------------------------------- /charts/hami/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: hami 3 | version: 2.5.0 4 | kubeVersion: ">= 1.18.0-0" 5 | description: Heterogeneous AI Computing Virtualization Middleware 6 | keywords: 7 | - vgpu 8 | - gpu 9 | type: application 10 | maintainers: 11 | - name: limengxuan 12 | email: archlitchi@gmail.com 13 | - name: zhangxiao 14 | email: xiaozhang0210@hotmail.com 15 | appVersion: "2.5.0" 16 | 17 | -------------------------------------------------------------------------------- /charts/hami/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | ** Please be patient while the chart is being deployed ** 2 | Resource name: {{ .Values.resourceName }} 3 | 4 | -------------------------------------------------------------------------------- /charts/hami/templates/device-plugin/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ include "hami-vgpu.device-plugin" . }} 5 | namespace: {{ include "hami-vgpu.namespace" . }} 6 | labels: 7 | app.kubernetes.io/component: hami-device-plugin 8 | {{- include "hami-vgpu.labels" . | nindent 4 }} 9 | data: 10 | config.json: | 11 | { 12 | "nodeconfig": [ 13 | { 14 | "name": "m5-cloudinfra-online02", 15 | "operatingmode": "hami-core", 16 | "devicememoryscaling": 1.8, 17 | "devicesplitcount": 10, 18 | "migstrategy":"none", 19 | "filterdevices": { 20 | "uuid": [], 21 | "index": [] 22 | } 23 | } 24 | ] 25 | } -------------------------------------------------------------------------------- /charts/hami/templates/device-plugin/monitorrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "hami-vgpu.device-plugin" . }}-monitor 5 | rules: 6 | - apiGroups: 7 | - "" 8 | resources: 9 | - pods 10 | verbs: 11 | - get 12 | - create 13 | - watch 14 | - list 15 | - update 16 | - patch 17 | - apiGroups: 18 | - "" 19 | resources: 20 | - nodes 21 | verbs: 22 | - get 23 | - update 24 | - list 25 | - patch 26 | 27 | 28 | -------------------------------------------------------------------------------- /charts/hami/templates/device-plugin/monitorrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "hami-vgpu.device-plugin" . }} 5 | labels: 6 | app.kubernetes.io/component: "hami-device-plugin" 7 | {{- include "hami-vgpu.labels" . | nindent 4 }} 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | #name: cluster-admin 12 | name: {{ include "hami-vgpu.device-plugin" . 
}}-monitor 13 | subjects: 14 | - kind: ServiceAccount 15 | name: {{ include "hami-vgpu.device-plugin" . }} 16 | namespace: {{ include "hami-vgpu.namespace" . }} 17 | -------------------------------------------------------------------------------- /charts/hami/templates/device-plugin/monitorservice.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "hami-vgpu.device-plugin" . }}-monitor 5 | namespace: {{ include "hami-vgpu.namespace" . }} 6 | labels: 7 | app.kubernetes.io/component: hami-device-plugin 8 | {{- include "hami-vgpu.labels" . | nindent 4 }} 9 | {{- if .Values.devicePlugin.service.labels }} # Use devicePlugin instead of scheduler 10 | {{ toYaml .Values.devicePlugin.service.labels | indent 4 }} 11 | {{- end }} 12 | {{- if .Values.devicePlugin.service.annotations }} # Use devicePlugin instead of scheduler 13 | annotations: {{ toYaml .Values.devicePlugin.service.annotations | nindent 4 }} 14 | {{- end }} 15 | spec: 16 | type: {{ .Values.devicePlugin.service.type | default "NodePort" }} # Default type is NodePort 17 | ports: 18 | - name: monitorport 19 | port: {{ .Values.devicePlugin.service.httpPort | default 31992 }} # Default HTTP port is 31992 20 | targetPort: 9394 21 | {{- if eq (.Values.devicePlugin.service.type | default "NodePort") "NodePort" }} # If type is NodePort, set nodePort 22 | nodePort: {{ .Values.devicePlugin.service.httpPort | default 31992 }} 23 | {{- end }} 24 | protocol: TCP 25 | selector: 26 | app.kubernetes.io/component: hami-device-plugin 27 | {{- include "hami-vgpu.selectorLabels" . | nindent 4 }} -------------------------------------------------------------------------------- /charts/hami/templates/device-plugin/monitorserviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "hami-vgpu.device-plugin" . }} 5 | namespace: {{ include "hami-vgpu.namespace" . }} 6 | labels: 7 | app.kubernetes.io/component: "hami-device-plugin" 8 | {{- include "hami-vgpu.labels" . | nindent 4 }} 9 | -------------------------------------------------------------------------------- /charts/hami/templates/device-plugin/runtime-class.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.devicePlugin.createRuntimeClass .Values.devicePlugin.runtimeClassName }} 2 | apiVersion: node.k8s.io/v1 3 | kind: RuntimeClass 4 | metadata: 5 | name: {{ .Values.devicePlugin.runtimeClassName }} 6 | annotations: 7 | helm.sh/hook: pre-install,pre-upgrade 8 | handler: nvidia 9 | {{- end }} 10 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/certmanager.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.scheduler.certManager.enabled }} 2 | apiVersion: cert-manager.io/v1 3 | kind: Certificate 4 | metadata: 5 | name: {{ include "hami-vgpu.scheduler" . }}-serving-cert 6 | namespace: {{ include "hami-vgpu.namespace" . }} 7 | labels: 8 | app.kubernetes.io/component: hami-scheduler 9 | {{- include "hami-vgpu.labels" . | nindent 4 }} 10 | spec: 11 | dnsNames: 12 | - {{ include "hami-vgpu.scheduler" . }}.{{ include "hami-vgpu.namespace" . }}.svc 13 | - {{ include "hami-vgpu.scheduler" . }}.{{ include "hami-vgpu.namespace" . 
}}.svc.cluster.local 14 | issuerRef: 15 | kind: Issuer 16 | name: {{ include "hami-vgpu.scheduler" . }}-selfsigned-issuer 17 | secretName: {{ include "hami-vgpu.scheduler.tls" . }} 18 | --- 19 | apiVersion: cert-manager.io/v1 20 | kind: Issuer 21 | metadata: 22 | name: {{ include "hami-vgpu.scheduler" . }}-selfsigned-issuer 23 | namespace: {{ include "hami-vgpu.namespace" . }} 24 | labels: 25 | app.kubernetes.io/component: hami-scheduler 26 | {{- include "hami-vgpu.labels" . | nindent 4 }} 27 | spec: 28 | selfSigned: {} 29 | {{- end }} 30 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/configmapnew.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.scheduler.kubeScheduler.enabled }} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ include "hami-vgpu.scheduler" . }}-newversion 6 | namespace: {{ include "hami-vgpu.namespace" . }} 7 | labels: 8 | app.kubernetes.io/component: hami-scheduler 9 | {{- include "hami-vgpu.labels" . | nindent 4 }} 10 | data: 11 | config.yaml: | 12 | {{- if gt (regexReplaceAll "[^0-9]" .Capabilities.KubeVersion.Minor "" | int) 25}} 13 | apiVersion: kubescheduler.config.k8s.io/v1 14 | {{- else }} 15 | apiVersion: kubescheduler.config.k8s.io/v1beta2 16 | {{- end }} 17 | kind: KubeSchedulerConfiguration 18 | leaderElection: 19 | leaderElect: false 20 | profiles: 21 | - schedulerName: {{ .Values.schedulerName }} 22 | extenders: 23 | - urlPrefix: "https://127.0.0.1:443" 24 | filterVerb: filter 25 | bindVerb: bind 26 | nodeCacheCapable: true 27 | weight: 1 28 | httpTimeout: 30s 29 | enableHTTPS: true 30 | tlsConfig: 31 | insecure: true 32 | managedResources: 33 | - name: {{ .Values.resourceName }} 34 | ignoredByScheduler: true 35 | - name: {{ .Values.resourceMem }} 36 | ignoredByScheduler: true 37 | - name: {{ .Values.resourceCores }} 38 | ignoredByScheduler: true 39 | - name: {{ .Values.resourceMemPercentage }} 40 | ignoredByScheduler: true 41 | - name: {{ .Values.resourcePriority }} 42 | ignoredByScheduler: true 43 | - name: {{ .Values.mluResourceName }} 44 | ignoredByScheduler: true 45 | - name: {{ .Values.dcuResourceName }} 46 | ignoredByScheduler: true 47 | - name: {{ .Values.dcuResourceMem }} 48 | ignoredByScheduler: true 49 | - name: {{ .Values.dcuResourceCores }} 50 | ignoredByScheduler: true 51 | - name: {{ .Values.iluvatarResourceName }} 52 | ignoredByScheduler: true 53 | - name: "metax-tech.com/gpu" 54 | ignoredByScheduler: true 55 | - name: {{ .Values.metaxResourceName }} 56 | ignoredByScheduler: true 57 | - name: {{ .Values.metaxResourceCore }} 58 | ignoredByScheduler: true 59 | - name: {{ .Values.metaxResourceMem }} 60 | ignoredByScheduler: true 61 | {{- if .Values.devices.ascend.enabled }} 62 | {{- range .Values.devices.ascend.customresources }} 63 | - name: {{ . }} 64 | ignoredByScheduler: true 65 | {{- end }} 66 | {{- end }} 67 | {{- if .Values.devices.mthreads.enabled }} 68 | {{- range .Values.devices.mthreads.customresources }} 69 | - name: {{ . }} 70 | ignoredByScheduler: true 71 | {{- end }} 72 | {{- end }} 73 | {{- if .Values.devices.enflame.enabled }} 74 | {{- range .Values.devices.enflame.customresources }} 75 | - name: {{ . 
}} 76 | ignoredByScheduler: true 77 | {{- end }} 78 | {{- end }} 79 | {{- end }} 80 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/job-patch/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: {{ include "hami-vgpu.fullname" . }}-admission 6 | annotations: 7 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 8 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 9 | labels: 10 | {{- include "hami-vgpu.labels" . | nindent 4 }} 11 | app.kubernetes.io/component: admission-webhook 12 | rules: 13 | - apiGroups: 14 | - admissionregistration.k8s.io 15 | resources: 16 | #- validatingwebhookconfigurations 17 | - mutatingwebhookconfigurations 18 | verbs: 19 | - get 20 | - update 21 | {{- if .Values.podSecurityPolicy.enabled }} 22 | - apiGroups: ['extensions'] 23 | resources: ['podsecuritypolicies'] 24 | verbs: ['use'] 25 | resourceNames: 26 | - {{ include "hami-vgpu.fullname" . }}-admission 27 | {{- end }} 28 | {{- end }} -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/job-patch/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: {{ include "hami-vgpu.fullname" . }}-admission 6 | annotations: 7 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 8 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 9 | labels: 10 | {{- include "hami-vgpu.labels" . | nindent 4 }} 11 | app.kubernetes.io/component: admission-webhook 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: ClusterRole 15 | name: {{ include "hami-vgpu.fullname" . }}-admission 16 | subjects: 17 | - kind: ServiceAccount 18 | name: {{ include "hami-vgpu.fullname" . }}-admission 19 | namespace: {{ include "hami-vgpu.namespace" . }} 20 | {{- end }} 21 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/job-patch/job-patchWebhook.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} 2 | apiVersion: batch/v1 3 | kind: Job 4 | metadata: 5 | name: {{ include "hami-vgpu.fullname" . }}-admission-patch 6 | namespace: {{ include "hami-vgpu.namespace" . }} 7 | annotations: 8 | "helm.sh/hook": post-install,post-upgrade 9 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 10 | labels: 11 | {{- include "hami-vgpu.labels" . | nindent 4 }} 12 | app.kubernetes.io/component: admission-webhook 13 | spec: 14 | {{- if .Capabilities.APIVersions.Has "batch/v1alpha1" }} 15 | # Alpha feature since k8s 1.12 16 | ttlSecondsAfterFinished: 0 17 | {{- end }} 18 | template: 19 | metadata: 20 | name: {{ include "hami-vgpu.fullname" . }}-admission-patch 21 | {{- if .Values.scheduler.patch.podAnnotations }} 22 | annotations: {{ toYaml .Values.scheduler.patch.podAnnotations | nindent 8 }} 23 | {{- end }} 24 | labels: 25 | {{- include "hami-vgpu.labels" . 
| nindent 8 }} 26 | app.kubernetes.io/component: admission-webhook 27 | hami.io/webhook: ignore 28 | spec: 29 | {{- include "hami-vgpu.imagePullSecrets" . | nindent 6}} 30 | {{- if .Values.scheduler.patch.priorityClassName }} 31 | priorityClassName: {{ .Values.scheduler.patch.priorityClassName }} 32 | {{- end }} 33 | containers: 34 | - name: patch 35 | {{- if ge (regexReplaceAll "[^0-9]" .Capabilities.KubeVersion.Minor "" | int) 22 }} 36 | image: {{ .Values.scheduler.patch.imageNew }} 37 | {{- else }} 38 | image: {{ .Values.scheduler.patch.image }} 39 | {{- end }} 40 | imagePullPolicy: {{ .Values.scheduler.patch.imagePullPolicy }} 41 | args: 42 | - patch 43 | - --webhook-name={{ include "hami-vgpu.scheduler.webhook" . }} 44 | - --namespace={{ include "hami-vgpu.namespace" . }} 45 | - --patch-validating=false 46 | - --secret-name={{ include "hami-vgpu.scheduler.tls" . }} 47 | restartPolicy: OnFailure 48 | serviceAccountName: {{ include "hami-vgpu.fullname" . }}-admission 49 | {{- if .Values.scheduler.patch.nodeSelector }} 50 | nodeSelector: {{ toYaml .Values.scheduler.patch.nodeSelector | nindent 8 }} 51 | {{- end }} 52 | {{- if .Values.scheduler.patch.tolerations }} 53 | tolerations: {{ toYaml .Values.scheduler.patch.tolerations | nindent 8 }} 54 | {{- end }} 55 | securityContext: 56 | runAsNonRoot: true 57 | runAsUser: {{ .Values.scheduler.patch.runAsUser }} 58 | {{- end }} 59 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/job-patch/psp.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} 2 | {{- if .Values.podSecurityPolicy.enabled }} 3 | apiVersion: policy/v1beta1 4 | kind: PodSecurityPolicy 5 | metadata: 6 | name: {{ include "hami-vgpu.fullname" . }}-admission 7 | annotations: 8 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 9 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 10 | labels: 11 | {{- include "hami-vgpu.labels" . | nindent 4 }} 12 | app.kubernetes.io/component: admission-webhook 13 | spec: 14 | allowPrivilegeEscalation: false 15 | fsGroup: 16 | ranges: 17 | - max: 65535 18 | min: 1 19 | rule: MustRunAs 20 | requiredDropCapabilities: 21 | - ALL 22 | runAsUser: 23 | rule: MustRunAsNonRoot 24 | seLinux: 25 | rule: RunAsAny 26 | supplementalGroups: 27 | ranges: 28 | - max: 65535 29 | min: 1 30 | rule: MustRunAs 31 | volumes: 32 | - configMap 33 | - emptyDir 34 | - projected 35 | - secret 36 | - downwardAPI 37 | {{- end }} 38 | {{- end }} 39 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/job-patch/role.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | name: {{ include "hami-vgpu.fullname" . }}-admission 6 | namespace: {{ include "hami-vgpu.namespace" . }} 7 | annotations: 8 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 9 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 10 | labels: 11 | {{- include "hami-vgpu.labels" . 
| nindent 4 }} 12 | app.kubernetes.io/component: admission-webhook 13 | rules: 14 | - apiGroups: 15 | - "" 16 | resources: 17 | - secrets 18 | verbs: 19 | - get 20 | - create 21 | {{- end }} 22 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/job-patch/rolebinding.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: RoleBinding 4 | metadata: 5 | name: {{ include "hami-vgpu.fullname" . }}-admission 6 | namespace: {{ include "hami-vgpu.namespace" . }} 7 | annotations: 8 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 9 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 10 | labels: 11 | {{- include "hami-vgpu.labels" . | nindent 4 }} 12 | app.kubernetes.io/component: admission-webhook 13 | roleRef: 14 | apiGroup: rbac.authorization.k8s.io 15 | kind: Role 16 | name: {{ include "hami-vgpu.fullname" . }}-admission 17 | subjects: 18 | - kind: ServiceAccount 19 | name: {{ include "hami-vgpu.fullname" . }}-admission 20 | namespace: {{ include "hami-vgpu.namespace" . }} 21 | {{- end }} 22 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/job-patch/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "hami-vgpu.fullname" . }}-admission 6 | namespace: {{ include "hami-vgpu.namespace" . }} 7 | annotations: 8 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 9 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 10 | labels: 11 | {{- include "hami-vgpu.labels" . | nindent 4 }} 12 | app.kubernetes.io/component: admission-webhook 13 | {{- end }} 14 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "hami-vgpu.scheduler" . }} 5 | labels: 6 | app.kubernetes.io/component: "hami-scheduler" 7 | {{- include "hami-vgpu.labels" . | nindent 4 }} 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | name: cluster-admin 12 | subjects: 13 | - kind: ServiceAccount 14 | name: {{ include "hami-vgpu.scheduler" . }} 15 | namespace: {{ include "hami-vgpu.namespace" . }} 16 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "hami-vgpu.scheduler" . }} 5 | namespace: {{ include "hami-vgpu.namespace" . }} 6 | labels: 7 | app.kubernetes.io/component: hami-scheduler 8 | {{- include "hami-vgpu.labels" . 
| nindent 4 }} 9 | {{- if .Values.scheduler.service.labels }} 10 | {{ toYaml .Values.scheduler.service.labels | indent 4 }} 11 | {{- end }} 12 | {{- if .Values.scheduler.service.annotations }} 13 | annotations: {{ toYaml .Values.scheduler.service.annotations | nindent 4 }} 14 | {{- end }} 15 | spec: 16 | type: {{ .Values.scheduler.service.type | default "NodePort" }} # Default type is NodePort 17 | ports: 18 | - name: http 19 | port: {{ .Values.scheduler.service.httpPort | default 443 }} # Default HTTP port is 443 20 | targetPort: {{ .Values.scheduler.service.httpTargetPort | default 443 }} 21 | {{- if eq (.Values.scheduler.service.type | default "NodePort") "NodePort" }} # If type is NodePort, set nodePort 22 | nodePort: {{ .Values.scheduler.service.schedulerPort | default 31998 }} 23 | {{- end }} 24 | protocol: TCP 25 | - name: monitor 26 | port: {{ .Values.scheduler.service.monitorPort | default 31993 }} # Default monitoring port is 31993 27 | targetPort: {{ .Values.scheduler.service.monitorTargetPort | default 9395 }} 28 | {{- if eq (.Values.scheduler.service.type | default "NodePort") "NodePort" }} # If type is NodePort, set nodePort 29 | nodePort: {{ .Values.scheduler.service.monitorPort | default 31993 }} 30 | {{- end }} 31 | protocol: TCP 32 | selector: 33 | app.kubernetes.io/component: hami-scheduler 34 | {{- include "hami-vgpu.selectorLabels" . | nindent 4 }} -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "hami-vgpu.scheduler" . }} 5 | namespace: {{ include "hami-vgpu.namespace" . }} 6 | labels: 7 | app.kubernetes.io/component: "hami-scheduler" 8 | {{- include "hami-vgpu.labels" . | nindent 4 }} 9 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/webhook.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: MutatingWebhookConfiguration 3 | metadata: 4 | {{- if .Values.scheduler.certManager.enabled }} 5 | annotations: 6 | cert-manager.io/inject-ca-from: {{ include "hami-vgpu.namespace" . }}/{{ include "hami-vgpu.scheduler" . }}-serving-cert 7 | {{- end }} 8 | name: {{ include "hami-vgpu.scheduler.webhook" . }} 9 | webhooks: 10 | - admissionReviewVersions: 11 | - v1beta1 12 | clientConfig: 13 | {{- if .Values.scheduler.admissionWebhook.customURL.enabled }} 14 | url: https://{{ .Values.scheduler.admissionWebhook.customURL.host}}:{{.Values.scheduler.admissionWebhook.customURL.port}}{{.Values.scheduler.admissionWebhook.customURL.path}} 15 | {{- else }} 16 | service: 17 | name: {{ include "hami-vgpu.scheduler" . }} 18 | namespace: {{ include "hami-vgpu.namespace" . 
}} 19 | path: /webhook 20 | port: {{ .Values.scheduler.service.httpPort }} 21 | {{- end }} 22 | failurePolicy: {{ .Values.scheduler.admissionWebhook.failurePolicy }} 23 | matchPolicy: Equivalent 24 | name: vgpu.hami.io 25 | namespaceSelector: 26 | matchExpressions: 27 | - key: hami.io/webhook 28 | operator: NotIn 29 | values: 30 | - ignore 31 | {{- if .Values.scheduler.admissionWebhook.whitelistNamespaces }} 32 | - key: kubernetes.io/metadata.name 33 | operator: NotIn 34 | values: 35 | {{- toYaml .Values.scheduler.admissionWebhook.whitelistNamespaces | nindent 10 }} 36 | {{- end }} 37 | objectSelector: 38 | matchExpressions: 39 | - key: hami.io/webhook 40 | operator: NotIn 41 | values: 42 | - ignore 43 | reinvocationPolicy: {{ .Values.scheduler.admissionWebhook.reinvocationPolicy }} 44 | rules: 45 | - apiGroups: 46 | - "" 47 | apiVersions: 48 | - v1 49 | operations: 50 | - CREATE 51 | resources: 52 | - pods 53 | scope: '*' 54 | sideEffects: None 55 | timeoutSeconds: 10 56 | -------------------------------------------------------------------------------- /cmd/device-plugin/nvidia/watchers.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "os" 21 | "os/signal" 22 | 23 | "github.com/fsnotify/fsnotify" 24 | ) 25 | 26 | func newFSWatcher(files ...string) (*fsnotify.Watcher, error) { 27 | watcher, err := fsnotify.NewWatcher() 28 | if err != nil { 29 | return nil, err 30 | } 31 | 32 | for _, f := range files { 33 | err = watcher.Add(f) 34 | if err != nil { 35 | watcher.Close() 36 | return nil, err 37 | } 38 | } 39 | 40 | return watcher, nil 41 | } 42 | 43 | func newOSWatcher(sigs ...os.Signal) chan os.Signal { 44 | sigChan := make(chan os.Signal, 1) 45 | signal.Notify(sigChan, sigs...) 46 | 47 | return sigChan 48 | } 49 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. 
--go-grpc_opt=paths=source_relative noderpc/noderpc.proto 17 | go build 18 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/noderpc/noderpc.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2015 gRPC authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | //     http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | syntax = "proto3"; 16 | 17 | option go_package = "gitlab.4pd.io/vGPUmonitor"; 18 | option java_multiple_files = true; 19 | option java_package = "io.grpc.examples.helloworld"; 20 | option java_outer_classname = "HelloWorldProto"; 21 | 22 | package pluginrpc; 23 | 24 | // The node vGPU monitoring service definition. 25 | service NodeVGPUInfo { 26 | // Returns vGPU usage for every pod on a node 27 | rpc GetNodeVGPU (GetNodeVGPURequest) returns (GetNodeVGPUReply) {} 28 | } 29 | 30 | // A process slot inside the shared region 31 | message shrregProcSlotT { 32 | int32 pid = 1; 33 | repeated uint64 used = 2; 34 | int32 status = 3; 35 | } 36 | 37 | // The sharedRegionT struct is the main struct for monitoring vgpu 38 | message sharedRegionT { 39 | int32 initializedFlag = 1; 40 | uint32 ownerPid = 2; 41 | uint32 sem = 3; 42 | repeated uint64 limit = 4; 43 | repeated uint64 sm_limit = 5; 44 | repeated shrregProcSlotT procs = 6; 45 | } 46 | 47 | message podusage { 48 | string poduuid = 1; 49 | sharedRegionT podvgpuinfo = 2; 50 | } 51 | 52 | // The request message, keyed by container UUID. 53 | message GetNodeVGPURequest { 54 | string ctruuid = 1; 55 | } 56 | 57 | // The response message containing per-pod vGPU usage 58 | message GetNodeVGPUReply { 59 | string nodeid = 1; 60 | repeated podusage nodevgpuinfo = 2; 61 | } 62 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/validation.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | ) 23 | 24 | var requiredEnvVars = map[string]bool{ 25 | "HOOK_PATH": true, 26 | "OTHER_ENV_VAR": false, 27 | } 28 | 29 | func ValidateEnvVars() error { 30 | for envVar, required := range requiredEnvVars { 31 | _, exists := os.LookupEnv(envVar) 32 | if required && !exists { 33 | return fmt.Errorf("required environment variable %s not set", envVar) 34 | } 35 | } 36 | return nil 37 | } 38 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG GOLANG_IMAGE=golang:1.22.5-bullseye 2 | ARG NVIDIA_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu20.04 3 | 4 | FROM $GOLANG_IMAGE AS build 5 | FROM $GOLANG_IMAGE AS gobuild 6 | ARG GOPROXY 7 | ARG VERSION 8 | ADD . /k8s-vgpu 9 | #RUN --mount=type=cache,target=/go/pkg/mod \ 10 | # cd /k8s-vgpu && make all 11 | RUN cd /k8s-vgpu && make all VERSION=$VERSION 12 | RUN go install github.com/NVIDIA/mig-parted/cmd/nvidia-mig-parted@v0.10.0 13 | 14 | FROM $NVIDIA_IMAGE AS nvbuild 15 | COPY ./libvgpu /libvgpu 16 | WORKDIR /libvgpu 17 | ENV DEBIAN_FRONTEND=noninteractive 18 | RUN apt-get -y update; apt-get -y install cmake 19 | RUN bash ./build.sh 20 | 21 | FROM nvidia/cuda:12.6.3-base-ubi8 22 | RUN rm -rf /usr/local/cuda-12.6/compat/libcuda.so* 23 | ENV NVIDIA_DISABLE_REQUIRE="true" 24 | ENV NVIDIA_VISIBLE_DEVICES=all 25 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility 26 | 27 | ARG VERSION 28 | LABEL version="$VERSION" 29 | LABEL maintainer="info@dynamia.ai" 30 | COPY ./LICENSE /k8s-vgpu/LICENSE 31 | COPY --from=gobuild /k8s-vgpu/bin /k8s-vgpu/bin 32 | COPY --from=gobuild /go/bin/nvidia-mig-parted /k8s-vgpu/bin/ 33 | COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh 34 | COPY ./lib /k8s-vgpu/lib 35 | COPY --from=nvbuild /libvgpu/build/libvgpu.so /k8s-vgpu/lib/nvidia/libvgpu.so."$VERSION" 36 | COPY ./docker/vgpu-init.sh /k8s-vgpu/bin/vgpu-init.sh 37 | 38 | ENV PATH="/k8s-vgpu/bin:${PATH}" 39 | ARG DEST_DIR 40 | ENTRYPOINT ["/bin/bash", "-c", "entrypoint.sh $DEST_DIR"] 41 | -------------------------------------------------------------------------------- /docker/Dockerfile.hamicore: -------------------------------------------------------------------------------- 1 | ARG NVIDIA_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu20.04 2 | 3 | FROM $NVIDIA_IMAGE AS nvbuild 4 | COPY ./libvgpu /libvgpu 5 | WORKDIR /libvgpu 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | RUN apt-get -y update; apt-get -y install cmake 8 | RUN bash ./build.sh 9 | 10 | FROM nvidia/cuda:12.6.3-base-ubi8 11 | RUN rm -rf /usr/local/cuda-12.6/compat/libcuda.so* 12 | ENV NVIDIA_DISABLE_REQUIRE="true" 13 | ENV NVIDIA_VISIBLE_DEVICES=all 14 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility 15 | 16 | ARG VERSION 17 | LABEL version="$VERSION" 18 | LABEL maintainer="projecthami@dynamia.ai" 19 | COPY --from=nvbuild /libvgpu/build/libvgpu.so /k8s-vgpu/lib/nvidia/libvgpu.so."$VERSION" 20 | -------------------------------------------------------------------------------- /docker/Dockerfile.hamimaster: -------------------------------------------------------------------------------- 1 | ARG GOLANG_IMAGE 2 | ARG HAMICORE_IMAGE 3 | FROM $GOLANG_IMAGE AS build 4 | FROM $HAMICORE_IMAGE AS corebuild 5 | 6 | FROM $GOLANG_IMAGE AS GOBUILD 7 | ADD . 
/k8s-vgpu 8 | ARG VERSION 9 | RUN go env -w GO111MODULE=on 10 | RUN cd /k8s-vgpu && make all VERSION=$VERSION 11 | RUN go install github.com/NVIDIA/mig-parted/cmd/nvidia-mig-parted@v0.10.0 12 | 13 | FROM nvidia/cuda:12.6.3-base-ubuntu22.04 14 | RUN rm -rf /usr/local/cuda-12.6/compat/libcuda.so* 15 | ENV NVIDIA_DISABLE_REQUIRE="true" 16 | ENV NVIDIA_VISIBLE_DEVICES=all 17 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility 18 | 19 | ARG VERSION 20 | LABEL version="$VERSION" 21 | LABEL maintainer="opensource@4paradigm.com" 22 | COPY ./LICENSE /k8s-vgpu/LICENSE 23 | COPY --from=GOBUILD /k8s-vgpu/bin /k8s-vgpu/bin 24 | COPY --from=GOBUILD /go/bin/nvidia-mig-parted /k8s-vgpu/bin/ 25 | COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh 26 | COPY ./docker/vgpu-init.sh /k8s-vgpu/bin/vgpu-init.sh 27 | COPY ./lib /k8s-vgpu/lib 28 | COPY --from=corebuild /k8s-vgpu/lib/nvidia/libvgpu.so."$VERSION" /k8s-vgpu/lib/nvidia/libvgpu.so."$VERSION" 29 | 30 | ENV PATH="/k8s-vgpu/bin:${PATH}" 31 | ARG DEST_DIR 32 | ENTRYPOINT ["/bin/bash", "-c", "entrypoint.sh $DEST_DIR"] 33 | -------------------------------------------------------------------------------- /docker/Dockerfile.withlib: -------------------------------------------------------------------------------- 1 | ARG GOLANG_IMAGE 2 | ARG NVIDIA_IMAGE 3 | FROM $GOLANG_IMAGE AS build 4 | 5 | FROM $GOLANG_IMAGE AS GOBUILD 6 | ADD . /k8s-vgpu 7 | ARG GOPROXY=https://goproxy.cn,direct 8 | ARG VERSION 9 | RUN go env -w GO111MODULE=on 10 | RUN cd /k8s-vgpu && make all VERSION=$VERSION 11 | RUN go install github.com/NVIDIA/mig-parted/cmd/nvidia-mig-parted@v0.10.0 12 | 13 | FROM nvidia/cuda:12.6.3-base-ubuntu22.04 14 | RUN rm -rf /usr/local/cuda-12.6/compat/libcuda.so* 15 | ENV NVIDIA_DISABLE_REQUIRE="true" 16 | ENV NVIDIA_VISIBLE_DEVICES=all 17 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility 18 | 19 | ARG VERSION 20 | LABEL version="$VERSION" 21 | LABEL maintainer="info@dynamia.ai" 22 | COPY ./LICENSE /k8s-vgpu/LICENSE 23 | COPY --from=GOBUILD /k8s-vgpu/bin /k8s-vgpu/bin 24 | COPY --from=GOBUILD /go/bin/nvidia-mig-parted /k8s-vgpu/bin/ 25 | COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh 26 | COPY ./docker/vgpu-init.sh /k8s-vgpu/bin/vgpu-init.sh 27 | COPY ./lib /k8s-vgpu/lib 28 | COPY ./libvgpu.so /k8s-vgpu/lib/nvidia/libvgpu.so."$VERSION" 29 | COPY ./license /k8s-vgpu/lib/nvidia/ 30 | COPY ./vgpuvalidator /k8s-vgpu/lib/nvidia 31 | 32 | ENV PATH="/k8s-vgpu/bin:${PATH}" 33 | ARG DEST_DIR 34 | ENTRYPOINT ["/bin/bash", "-c", "entrypoint.sh $DEST_DIR"] 35 | -------------------------------------------------------------------------------- /docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2024 The HAMi Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | # if [ $1 == "device-plugin" ]; then 18 | # cp -f /k8s-vgpu/lib/* $DEST_DIR/vgpu 19 | # fi 20 | exec "$@" 21 | -------------------------------------------------------------------------------- /docker/vgpu-init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Check if the destination directory is provided as an argument 4 | if [ -z "$1" ]; then 5 | echo "Usage: $0 <dest_dir>" 6 | exit 1 7 | fi 8 | 9 | # Source directory 10 | SOURCE_DIR="/k8s-vgpu/lib/nvidia/" 11 | 12 | # Destination directory from the argument 13 | DEST_DIR="$1" 14 | 15 | 16 | # Check if the destination directory exists, create it if it doesn't 17 | if [ ! -d "$DEST_DIR" ]; then 18 | mkdir -p "$DEST_DIR" 19 | fi 20 | 21 | # Traverse all files in the source directory 22 | find "$SOURCE_DIR" -type f | while read -r source_file; do 23 | # Get the relative path of the source file 24 | relative_path="${source_file#$SOURCE_DIR}" 25 | 26 | # Construct the destination file path 27 | dest_file="$DEST_DIR$relative_path" 28 | 29 | # If the destination file doesn't exist, copy the source file 30 | if [ ! -f "$dest_file" ]; then 31 | # Create the parent directory of the destination file if it doesn't exist 32 | mkdir -p "$(dirname "$dest_file")" 33 | 34 | # Copy the file from source to destination 35 | cp "$source_file" "$dest_file" 36 | echo "Copied: $source_file -> $dest_file" 37 | else 38 | # Compare MD5 values of source and destination files 39 | source_md5=$(md5sum "$source_file" | cut -d ' ' -f 1) 40 | dest_md5=$(md5sum "$dest_file" | cut -d ' ' -f 1) 41 | 42 | # If MD5 values are different, copy the file 43 | if [ "$source_md5" != "$dest_md5" ]; then 44 | cp "$source_file" "$dest_file" 45 | echo "Copied: $source_file -> $dest_file" 46 | else 47 | echo "Skipped (same MD5): $source_file" 48 | fi 49 | fi 50 | done 51 | -------------------------------------------------------------------------------- /docs/CHANGELOG/CHANGELOG-0.0.0.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | **Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* 4 | 5 | - [v0.0.0](#v000) 6 | - [Downloads for v0.0.0](#downloads-for-v000) 7 | - [Changelog since v0.0.0](#changelog-since-v000) 8 | - [Changes by Kind](#changes-by-kind) 9 | - [Bug Fixes](#bug-fixes) 10 | - [Others](#others) 11 | 12 | 13 | 14 | # v0.0.0 15 | ## Downloads for v0.0.0 16 | 17 | Download v0.0.0 in the [v0.0.0 release page](https://github.com/Project-HAMi/HAMi/releases/tag/v0.0.0). 18 | 19 | ## Changelog since v0.0.0 20 | ### Changes by Kind 21 | #### Bug Fixes 22 | None. 23 | 24 | #### Deprecation 25 | None. 26 | 27 | #### Others 28 | None.
29 | 30 | -------------------------------------------------------------------------------- /docs/benchmark.md: -------------------------------------------------------------------------------- 1 | ## Benchmarks 2 | 3 | Three instances from ai-benchmark have been used to evaluate vGPU-device-plugin performance as follows: 4 | 5 | | Test Environment | Description | 6 | | ---------------- | :------------------------------------------------------: | 7 | | Kubernetes version | v1.12.9 | 8 | | Docker version | 18.09.1 | 9 | | GPU Type | Tesla V100 | 10 | | GPU Num | 2 | 11 | 12 | | Test instance | Description | 13 | | ------------- | :---------------------------------------------------------: | 14 | | nvidia-device-plugin | k8s + nvidia k8s-device-plugin | 15 | | vGPU-device-plugin | k8s + VGPU k8s-device-plugin, without virtual device memory | 16 | | vGPU-device-plugin(virtual device memory) | k8s + VGPU k8s-device-plugin, with virtual device memory | 17 | 18 | Test Cases: 19 | 20 | | test id | case | type | params | 21 | | ------- | :-----------: | :-------: | :---------------------: | 22 | | 1.1 | Resnet-V2-50 | inference | batch=50,size=346*346 | 23 | | 1.2 | Resnet-V2-50 | training | batch=20,size=346*346 | 24 | | 2.1 | Resnet-V2-152 | inference | batch=10,size=256*256 | 25 | | 2.2 | Resnet-V2-152 | training | batch=10,size=256*256 | 26 | | 3.1 | VGG-16 | inference | batch=20,size=224*224 | 27 | | 3.2 | VGG-16 | training | batch=2,size=224*224 | 28 | | 4.1 | DeepLab | inference | batch=2,size=512*512 | 29 | | 4.2 | DeepLab | training | batch=1,size=384*384 | 30 | | 5.1 | LSTM | inference | batch=100,size=1024*300 | 31 | | 5.2 | LSTM | training | batch=10,size=1024*300 | 32 | 33 | Test Result: ![img](../imgs/benchmark_inf.png) 34 | 35 | ![img](../imgs/benchmark_train.png) 36 | 37 | To reproduce: 38 | 39 | 1. install the k8s-vGPU-scheduler and configure it properly 40 | 2. run the benchmark job 41 | 42 | ``` 43 | $ kubectl apply -f benchmarks/ai-benchmark/ai-benchmark.yml 44 | ``` 45 | 46 | 3.
View the result by using kubectl logs 47 | 48 | ``` 49 | $ kubectl logs [pod id] 50 | ``` -------------------------------------------------------------------------------- /docs/benchmark_cn.md: -------------------------------------------------------------------------------- 1 | ## 性能测试 2 | 3 | 在测试报告中,我们一共在下面五种场景都执行了ai-benchmark 测试脚本,并汇总最终结果: 4 | 5 | | 测试环境 | 环境描述 | 6 | | ---------------- | :------------------------------------------------------: | 7 | | Kubernetes version | v1.12.9 | 8 | | Docker version | 18.09.1 | 9 | | GPU Type | Tesla V100 | 10 | | GPU Num | 2 | 11 | 12 | | 测试名称 | 测试用例 | 13 | | -------- | :------------------------------------------------: | 14 | | Nvidia-device-plugin | k8s + nvidia官方k8s-device-plugin | 15 | | vGPU-device-plugin | k8s + VGPU k8s-device-plugin,无虚拟显存 | 16 | | vGPU-device-plugin(virtual device memory) | k8s + VGPU k8s-device-plugin,高负载,开启虚拟显存 | 17 | 18 | 测试内容 19 | 20 | | test id | 名称 | 类型 | 参数 | 21 | | ------- | :-----------: | :-------: | :---------------------: | 22 | | 1.1 | Resnet-V2-50 | inference | batch=50,size=346*346 | 23 | | 1.2 | Resnet-V2-50 | training | batch=20,size=346*346 | 24 | | 2.1 | Resnet-V2-152 | inference | batch=10,size=256*256 | 25 | | 2.2 | Resnet-V2-152 | training | batch=10,size=256*256 | 26 | | 3.1 | VGG-16 | inference | batch=20,size=224*224 | 27 | | 3.2 | VGG-16 | training | batch=2,size=224*224 | 28 | | 4.1 | DeepLab | inference | batch=2,size=512*512 | 29 | | 4.2 | DeepLab | training | batch=1,size=384*384 | 30 | | 5.1 | LSTM | inference | batch=100,size=1024*300 | 31 | | 5.2 | LSTM | training | batch=10,size=1024*300 | 32 | 33 | 测试结果: ![img](../imgs/benchmark_inf.png) 34 | 35 | ![img](../imgs/benchmark_train.png) 36 | 37 | 测试步骤: 38 | 39 | 1. 安装nvidia-device-plugin,并配置相应的参数 40 | 2. 运行benchmark任务 41 | 42 | ``` 43 | $ kubectl apply -f benchmarks/ai-benchmark/ai-benchmark.yml 44 | ``` 45 | 46 | 3.
通过kubectl logs 查看结果 47 | 48 | ``` 49 | $ kubectl logs [pod id] 50 | ``` -------------------------------------------------------------------------------- /docs/cambricon-mlu-support_cn.md: -------------------------------------------------------------------------------- 1 | ## 简介 2 | 3 | 本组件支持复用寒武纪MLU设备,并为此提供以下几种与vGPU类似的复用功能,包括: 4 | 5 | ***MLU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 6 | 7 | ***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配MLU,本组件会确保任务使用的显存不会超过分配数值 8 | 9 | ***可限制分配的算力大小***: 你现在可以用百分比来分配MLU的算力,本组件会确保任务使用的算力不会超过分配数值 10 | 11 | ***指定MLU型号***:当前任务可以通过设置annotation("cambricon.com/use-mlutype","cambricon.com/nouse-mlutype")的方式,来选择使用或者不使用某些具体型号的MLU 12 | 13 | ## 节点需求 14 | 15 | * neuware-mlu370-driver > 5.10 16 | * cntoolkit > 2.5.3 17 | 18 | ## 开启MLU复用 19 | 20 | * 通过helm部署本组件, 参照[主文档中的开启vgpu支持章节](https://github.com/Project-HAMi/HAMi/blob/master/README_cn.md#kubernetes开启vgpu支持) 21 | 22 | * 使用以下指令,为MLU节点打上label 23 | ``` 24 | kubectl label node {mlu-node} mlu=on 25 | ``` 26 | 27 | * 从您的设备提供商处获取cambricon-device-plugin,并配置以下两个参数: 28 | 29 | `mode=dynamic-smlu`, `min-dsmlu-unit=256` 30 | 31 | 它们分别代表开启MLU复用功能,与设置最小可分配的内存单元为256M,您可以参考设备提供方的文档来获取更多的配置信息。 32 | 33 | * 部署配置后的`cambricon-device-plugin` 34 | 35 | ``` 36 | kubectl apply -f cambricon-device-plugin-daemonset.yaml 37 | ``` 38 | 39 | 40 | ## 运行MLU任务 41 | 42 | ```yaml 43 | apiVersion: apps/v1 44 | kind: Deployment 45 | metadata: 46 | name: binpack-1 47 | labels: 48 | app: binpack-1 49 | spec: 50 | replicas: 1 51 | selector: 52 | matchLabels: 53 | app: binpack-1 54 | template: 55 | metadata: 56 | labels: 57 | app: binpack-1 58 | spec: 59 | containers: 60 | - name: c-1 61 | image: ubuntu:18.04 62 | command: ["sleep"] 63 | args: ["100000"] 64 | resources: 65 | limits: 66 | cambricon.com/vmlu: "1" 67 | cambricon.com/mlu.smlu.vmemory: "20" 68 | cambricon.com/mlu.smlu.vcore: "10" 69 | ``` 70 | 71 | ## 注意事项 72 | 73 | 1. 在init container中无法使用MLU复用功能,否则该任务不会被调度 74 | 75 | 2. 只有申请单MLU的任务可以指定显存`mlu.smlu.vmemory`和算力`mlu.smlu.vcore`的数值,若申请的MLU数量大于1,则所有申请的MLU都会被整卡分配 76 | -------------------------------------------------------------------------------- /docs/develop/design.md: -------------------------------------------------------------------------------- 1 | # Design 2 | 3 | 4 | 5 | The architecture of HAMi is shown in the figure above; it is organized in the form of a "chart". 6 | 7 | - MutatingWebhook 8 | 9 | The MutatingWebhook checks the validity of each task, and sets the "schedulerName" to the HAMi scheduler if the resource requests are recognized by HAMi. 10 | If not, the MutatingWebhook does nothing and passes the task to the default-scheduler. 11 | 12 | - Scheduler 13 | 14 | HAMi supports the default kube-scheduler and the volcano-scheduler; it implements an extender and registers 'Filter' and 'Score' methods to deal with sharable devices. 15 | When a pod with a sharable device request arrives, 'Filter' searches the cluster and returns a list of 'available' nodes. 'Score' scores each node 'Filter' returned, and picks the highest-scoring one to host the pod. It patches the schedule decision onto the corresponding pod annotations; for the detailed protocol, see [protocol.md](protocol.md) 16 | 17 | - DevicePlugin 18 | 19 | When the schedule decision is made, the scheduler calls the device plugin on that node to generate environment variables and mounts according to the pod annotations. 20 | Please note that the DP used here is a customized version; you need to install the one matching your device according to the [README](../../README.md).
Most official DPs will not fit in HAMi, and will result in unexpected behaviour 21 | 22 | - InContainer Control 23 | 24 | The implementation of the in-container hard limit is different for different devices. For example, HAMi-Core is responsible for NVIDIA devices, libvgpu-control.so is responsible for Iluvatar devices, etc. HAMi needs to pass the correct environment variables in order for it to operate. 25 | 26 | 27 | 28 | In summary, the flow of a pod is described in the figure above. 29 | -------------------------------------------------------------------------------- /docs/develop/imgs/flowchart.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/flowchart.jpeg -------------------------------------------------------------------------------- /docs/develop/imgs/gpu-scheduler-policy-demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/gpu-scheduler-policy-demo.png -------------------------------------------------------------------------------- /docs/develop/imgs/hami-dynamic-mig-procedure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/hami-dynamic-mig-procedure.png -------------------------------------------------------------------------------- /docs/develop/imgs/hami-dynamic-mig-structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/hami-dynamic-mig-structure.png -------------------------------------------------------------------------------- /docs/develop/imgs/node-shceduler-policy-demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/node-shceduler-policy-demo.png -------------------------------------------------------------------------------- /docs/develop/imgs/offline_validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/offline_validation.png -------------------------------------------------------------------------------- /docs/develop/imgs/protocol_pod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/protocol_pod.png -------------------------------------------------------------------------------- /docs/develop/imgs/protocol_register.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/protocol_register.png -------------------------------------------------------------------------------- /docs/develop/imgs/scheduler-policy-story.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/scheduler-policy-story.png -------------------------------------------------------------------------------- /docs/develop/roadmap.md: -------------------------------------------------------------------------------- 1 | # roadmap 2 | 3 | Heterogeneous AI computing devices to support: 4 | 5 | | Product | Manufacturer | Type | MemoryIsolation | CoreIsolation | MultiCard support | 6 | |-------------|------------|-------------|-----------|---------------|-------------------| 7 | | GPU | NVIDIA | All | ✅ | ✅ | ✅ | 8 | | MLU | Cambricon | 370, 590 | ✅ | ✅ | ❌ | 9 | | GCU | Enflame | S60 | ✅ | ✅ | ❌ | 10 | | DCU | Hygon | Z100, Z100L | ✅ | ✅ | ❌ | 11 | | Ascend | Huawei | 910B | ✅ | ✅ | ❌ | 12 | | GPU | iluvatar | All | ✅ | ✅ | ❌ | 13 | | DPU | Teco | Checking | In progress | In progress | ❌ | 14 | 15 | 16 | - [ ] Support video codec processing 17 | - [ ] Support Multi-Instance GPUs (MIG) 18 | - [ ] Support Flexible scheduling policies 19 | - [x] binpack 20 | - [x] spread 21 | - [ ] numa affinity 22 | - [ ] integrated gpu-operator 23 | - [ ] Rich observability support 24 | - [ ] DRA Support 25 | - [ ] Support Intel GPU device 26 | - [ ] Support AMD GPU device 27 | - [x] Support Enflame GCU device 28 | -------------------------------------------------------------------------------- /docs/hygon-dcu-support.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | **We now support hygon.com/dcu by implementing most of the device-sharing features available for nvidia-GPU**, including: 4 | 5 | ***DCU sharing***: Each task can allocate a portion of a DCU instead of a whole DCU card, thus a DCU can be shared among multiple tasks. 6 | 7 | ***Device Memory Control***: DCUs of a certain type (e.g. Z100) can be allocated a certain device memory size, and the task is guaranteed not to exceed that boundary. 8 | 9 | ***Device compute core limitation***: DCUs can be allocated a certain percentage of device cores (e.g. hygon.com/dcucores:60 indicates this container uses 60% of the compute cores of this device) 10 | 11 | ***DCU Type Specification***: You can specify which type of DCU to use or to avoid for a certain task, by setting "hygon.com/use-dcutype" or "hygon.com/nouse-dcutype" annotations. 12 | 13 | ## Prerequisites 14 | 15 | * dtk driver >= 24.04 16 | * hy-smi v1.6.0 17 | 18 | ## Enabling DCU-sharing Support 19 | 20 | * Deploy the dcu-vgpu-device-plugin [here](https://github.com/Project-HAMi/dcu-vgpu-device-plugin) 21 | 22 | 23 | ## Running DCU jobs 24 | 25 | Hygon DCUs can now be requested by a container 26 | using the `hygon.com/dcunum`, `hygon.com/dcumem` and `hygon.com/dcucores` resource types: 27 | 28 | ```yaml 29 | apiVersion: v1 30 | kind: Pod 31 | metadata: 32 | name: alexnet-tf-gpu-pod-mem 33 | labels: 34 | purpose: demo-tf-amdgpu 35 | spec: 36 | containers: 37 | - name: alexnet-tf-gpu-container 38 | image: pytorch:resnet50 39 | workingDir: /root 40 | command: ["sleep","infinity"] 41 | resources: 42 | limits: 43 | hygon.com/dcunum: 1 # requesting a DCU 44 | hygon.com/dcumem: 2000 # each dcu requires 2000 MiB device memory 45 | hygon.com/dcucores: 60 # each dcu uses 60% of total compute cores 46 | 47 | ``` 48 | 49 | ## Enable vDCU inside container 50 | 51 | You need to enable vDCU inside the container in order to use it.
52 | ``` 53 | source /opt/hygondriver/env.sh 54 | ``` 55 | 56 | Check whether you have successfully enabled vDCU by using the following command: 57 | 58 | ``` 59 | hy-virtual -show-device-info 60 | ``` 61 | 62 | If you have an output like this, then you have successfully enabled vDCU inside the container. 63 | 64 | ``` 65 | Device 0: 66 | Actual Device: 0 67 | Compute units: 60 68 | Global memory: 2097152000 bytes 69 | ``` 70 | 71 | Then launch your DCU tasks as you usually do. 72 | 73 | ## Notes 74 | 75 | 1. DCU-sharing in init containers is not supported; pods with "hygon.com/dcumem" in an init container will never be scheduled. 76 | 77 | 2. Only one vDCU can be acquired per container. If you want to mount multiple DCU devices, then you shouldn't set `hygon.com/dcumem` or `hygon.com/dcucores` 78 | -------------------------------------------------------------------------------- /docs/hygon-dcu-support_cn.md: -------------------------------------------------------------------------------- 1 | ## 简介 2 | 3 | 本组件支持复用海光DCU设备,并为此提供以下几种与vGPU类似的复用功能,包括: 4 | 5 | ***DCU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 6 | 7 | ***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配DCU,本组件会确保任务使用的显存不会超过分配数值 8 | 9 | ***可限制计算单元数量***: 你现在可以指定任务使用的算力比例(例如60即代表使用60%算力)来分配DCU,本组件会确保任务使用的算力不会超过分配数值 10 | 11 | ***指定DCU型号***:当前任务可以通过设置annotation("hygon.com/use-dcutype","hygon.com/nouse-dcutype")的方式,来选择使用或者不使用某些具体型号的DCU 12 | 13 | ## 节点需求 14 | 15 | * dtk driver >= 24.04 16 | * hy-smi v1.6.0 17 | 18 | ## 开启DCU复用 19 | 20 | * 部署[dcu-vgpu-device-plugin](https://github.com/Project-HAMi/dcu-vgpu-device-plugin) 21 | 22 | ## 运行DCU任务 23 | 24 | ```yaml 25 | apiVersion: v1 26 | kind: Pod 27 | metadata: 28 | name: alexnet-tf-gpu-pod-mem 29 | labels: 30 | purpose: demo-tf-amdgpu 31 | spec: 32 | containers: 33 | - name: alexnet-tf-gpu-container 34 | image: pytorch:resnet50 35 | workingDir: /root 36 | command: ["sleep","infinity"] 37 | resources: 38 | limits: 39 | hygon.com/dcunum: 1 # requesting a DCU 40 | hygon.com/dcumem: 2000 # each dcu requires 2000 MiB device memory 41 | hygon.com/dcucores: 60 # each dcu uses 60% of total compute cores 42 | 43 | ``` 44 | 45 | ## 容器内开启虚拟DCU功能 46 | 47 | 使用vDCU首先需要激活虚拟环境 48 | ``` 49 | source /opt/hygondriver/env.sh 50 | ``` 51 | 52 | 随后,使用hy-virtual指令查看虚拟设备是否已经激活 53 | ``` 54 | hy-virtual -show-device-info 55 | ``` 56 | 57 | 若输出如下,则代表虚拟设备已经成功激活 58 | ``` 59 | Device 0: 60 | Actual Device: 0 61 | Compute units: 60 62 | Global memory: 2097152000 bytes 63 | ``` 64 | 65 | 接下来正常启动DCU任务即可 66 | 67 | ## 注意事项 68 | 69 | 1. 在init container中无法使用DCU复用功能,否则该任务不会被调度 70 | 71 | 2.
每个容器最多只能使用一个虚拟DCU设备,如果您希望在容器中挂载多个DCU设备,则不能使用`hygon.com/dcumem`和`hygon.com/dcucores`字段 72 | -------------------------------------------------------------------------------- /docs/mind-map/HAMI-VGPU-mind-map-Chinese.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/mind-map/HAMI-VGPU-mind-map-Chinese.png -------------------------------------------------------------------------------- /docs/mind-map/HAMI-VGPU-mind-map-Chinese.xmind: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/mind-map/HAMI-VGPU-mind-map-Chinese.xmind -------------------------------------------------------------------------------- /docs/mind-map/HAMI-VGPU-mind-map-English.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/mind-map/HAMI-VGPU-mind-map-English.png -------------------------------------------------------------------------------- /docs/mind-map/HAMI-VGPU-mind-map-English.xmind: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/mind-map/HAMI-VGPU-mind-map-English.xmind -------------------------------------------------------------------------------- /docs/mind-map/readme: -------------------------------------------------------------------------------- 1 | - 根据交流群里各位大佬的交流梳理此份思维导图(尤其感谢 @意琦行 大佬梳理的博客) 2 | - 英文版由 @隽戈 大佬提供 3 | - 若有问题处,各位大佬可随时提出 4 | 5 | - Based on the discussion among the experts in the communication group, this mind map has been compiled (special thanks to @意琦行 for organizing the blog) 6 | - The English version is provided by @隽戈 7 | - If there are any issues, anyone can raise them at any time 8 | 9 | 10 | -------------------------------------------------------------------------------- /docs/mthreads-support.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | **We now support mthreads.com/vgpu by implementing most of the device-sharing features available for nvidia-GPU**, including: 4 | 5 | ***GPU sharing***: Each task can allocate a portion of a GPU instead of a whole GPU card, thus a GPU can be shared among multiple tasks. 6 | 7 | ***Device Memory Control***: GPUs of a certain type (e.g. MTT S4000) can be allocated a certain device memory size, and the task is guaranteed not to exceed that boundary. 8 | 9 | ***Device Core Control***: GPUs of a certain type (e.g. MTT S4000) can be allocated a limited number of compute cores, and the task is guaranteed not to exceed that boundary. 10 | 11 | ## Important Notes 12 | 13 | 1. Device sharing across multiple cards is not supported. 14 | 15 | 2. Only one mthreads device can be shared in a pod (even if there are multiple containers). 16 | 17 | 3. Allocating an exclusive mthreads GPU is supported by specifying mthreads.com/vgpu only. 18 | 19 | 4.
These features have been tested on MTT S4000 20 | 21 | ## Prerequisites 22 | 23 | * [MT CloudNative Toolkits > 1.9.0](https://docs.mthreads.com/cloud-native/cloud-native-doc-online/) 24 | * driver version >= 1.2.0 25 | 26 | ## Enabling GPU-sharing Support 27 | 28 | * Deploy MT-CloudNative Toolkit on mthreads nodes (Please consult your device provider to acquire its package and documentation) 29 | 30 | > **NOTICE:** *You can remove mt-mutating-webhook and mt-gpu-scheduler after installation (optional).* 31 | 32 | * set 'devices.mthreads.enabled=true' when installing HAMi 33 | 34 | ``` 35 | helm install hami hami-charts/hami --set scheduler.kubeScheduler.imageTag={your kubernetes version} --set devices.mthreads.enabled=true -n kube-system 36 | ``` 37 | 38 | ## Running Mthreads jobs 39 | 40 | Mthreads GPUs can now be requested by a container 41 | using the `mthreads.com/vgpu`, `mthreads.com/sgpu-memory` and `mthreads.com/sgpu-core` resource types: 42 | 43 | ```yaml 44 | apiVersion: v1 45 | kind: Pod 46 | metadata: 47 | name: gpushare-pod-default 48 | spec: 49 | restartPolicy: OnFailure 50 | containers: 51 | - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc 52 | imagePullPolicy: IfNotPresent 53 | name: gpushare-pod-1 54 | command: ["sleep"] 55 | args: ["100000"] 56 | resources: 57 | limits: 58 | mthreads.com/vgpu: 1 59 | mthreads.com/sgpu-memory: 32 60 | mthreads.com/sgpu-core: 8 61 | ``` 62 | 63 | > **NOTICE1:** *Each unit of sgpu-memory indicates 512M device memory* 64 | 65 | > **NOTICE2:** *You can find more examples in the [examples/mthreads folder](../examples/mthreads/)* 66 | -------------------------------------------------------------------------------- /docs/mthreads-support_cn.md: -------------------------------------------------------------------------------- 1 | ## 简介 2 | 3 | 本组件支持复用摩尔线程GPU设备,并为此提供以下几种与vGPU类似的复用功能,包括: 4 | 5 | ***GPU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 6 | 7 | ***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配GPU,本组件会确保任务使用的显存不会超过分配数值 8 | 9 | ***可限制分配的算力核组比例***: 你现在可以用算力核组数量(例如8个)来分配GPU,本组件会确保任务使用的算力不会超过分配数值 10 | 11 | ## 注意事项 12 | 13 | 1. 暂时不支持多卡切片,多卡任务只能分配整卡 14 | 15 | 2. 一个pod只能使用一个GPU生成的切片,即使该pod中有多个容器 16 | 17 | 3. 支持独占模式,只指定`mthreads.com/vgpu`即为独占申请 18 | 19 | 4.
本特性目前只支持MTT S4000设备 20 | 21 | ## 节点需求 22 | 23 | * [MT CloudNative Toolkits > 1.9.0](https://docs.mthreads.com/cloud-native/cloud-native-doc-online/) 24 | * 驱动版本 >= 1.2.0 25 | 26 | ## 开启GPU复用 27 | 28 | * 部署'MT-CloudNative Toolkit',摩尔线程的GPU共享需要配合厂家提供的该组件一起使用,请联系设备提供方获取 29 | 30 | > **注意:** *(可选)部署完之后,可卸载掉mt-mutating-webhook与mt-scheduler组件,因为这部分功能将由HAMi调度器提供* 31 | 32 | * 在安装HAMi时配置'devices.mthreads.enabled=true'参数 33 | 34 | ``` 35 | helm install hami hami-charts/hami --set scheduler.kubeScheduler.imageTag={your kubernetes version} --set devices.mthreads.enabled=true -n kube-system 36 | ``` 37 | 38 | ## 运行GPU任务 39 | 40 | 通过指定`mthreads.com/vgpu`、`mthreads.com/sgpu-memory`和`mthreads.com/sgpu-core`这3个参数,可以确定容器申请的切片个数、对应的显存和算力核组 41 | 42 | ```yaml 43 | apiVersion: v1 44 | kind: Pod 45 | metadata: 46 | name: gpushare-pod-default 47 | spec: 48 | restartPolicy: OnFailure 49 | containers: 50 | - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc 51 | imagePullPolicy: IfNotPresent 52 | name: gpushare-pod-1 53 | command: ["sleep"] 54 | args: ["100000"] 55 | resources: 56 | limits: 57 | mthreads.com/vgpu: 1 58 | mthreads.com/sgpu-memory: 32 59 | mthreads.com/sgpu-core: 8 60 | ``` 61 | 62 | > **注意1:** *每一单位的sgpu-memory代表512M的显存.* 63 | 64 | > **注意2:** *查看更多的[用例](../examples/mthreads/).* 65 | -------------------------------------------------------------------------------- /docs/offline-install.md: -------------------------------------------------------------------------------- 1 | # Offline-install Manual 2 | 3 | For clusters that don't have external web access, you can install HAMi by the following steps: 4 | 5 | 1. Refer to [README.md](../README.md) until step 'Install and Uninstall' 6 | 7 | 2. pull the following images, save them into '.tar' files, then move them into your cluster 8 | 9 | Image list: 10 | ``` 11 | projecthami/hami:{HAMi version} 12 | docker.io/jettech/kube-webhook-certgen:v1.5.2 13 | liangjw/kube-webhook-certgen:v1.1.1 14 | registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler:{your kubernetes version} 15 | ``` 16 | 17 | ``` 18 | docker pull {image_name} && docker save {image_name} -o {image_name}.tar 19 | ``` 20 | 21 | 3. Load these images using docker load, tag these images with your registry, and push them into your registry 22 | 23 | ``` 24 | docker load -i {HAMi_image}.tar 25 | docker tag projecthami/hami:{HAMi version} {your_inner_registry}/hami:{HAMi version} 26 | docker push {your_inner_registry}/hami:{HAMi version} 27 | docker tag docker.io/jettech/kube-webhook-certgen:v1.5.2 {your_inner_registry}/kube-webhook-certgen:v1.5.2 28 | docker push {your_inner_registry}/kube-webhook-certgen:v1.5.2 29 | docker tag liangjw/kube-webhook-certgen:v1.1.1 {your_inner_registry}/kube-webhook-certgen:v1.1.1 30 | docker push {your_inner_registry}/kube-webhook-certgen:v1.1.1 31 | docker tag registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler:{your kubernetes version} {your_inner_registry}/kube-scheduler:{your kubernetes version} 32 | docker push {your_inner_registry}/kube-scheduler:{your kubernetes version} 33 | ``` 34 | 35 | 4. Download the charts folder from [github](https://github.com/Project-HAMi/HAMi/tree/master/charts), place it into ${CHART_PATH} inside the cluster, then edit the following fields in ${CHART_PATH}/hami/values.yaml. 36 | 37 | ``` 38 | scheduler.kubeScheduler.image 39 | scheduler.extender.image 40 | scheduler.patch.image 41 | scheduler.patch.imageNew 42 | scheduler.devicePlugin.image 43 | scheduler.devicePlugin.monitorimage 44 | ``` 45 | 46 | 5.
Execute the following command in your ${CHART_PATH} folder 47 | 48 | ``` 49 | helm install hami hami --set scheduler.kubeScheduler.imageTag={your k8s server version} -n kube-system 50 | ``` 51 | 52 | 6. Verify your installation 53 | 54 | execute the following command: 55 | ``` 56 | kubectl get pods -n kube-system 57 | ``` 58 | 59 | If you can see both the 'device-plugin' and 'scheduler' pods running, then HAMi is installed successfully, as shown in the figure below: 60 | 61 | 62 | -------------------------------------------------------------------------------- /docs/proposals/e2e_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/proposals/e2e_test.png -------------------------------------------------------------------------------- /docs/proposals/gpu_utilization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/proposals/gpu_utilization.png -------------------------------------------------------------------------------- /example.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | labels: 5 | kubernetes.io/metadata.name: gpu-test-workloads 6 | pod-security.kubernetes.io/enforce: privileged 7 | name: gpu-test-workloads 8 | --- 9 | apiVersion: apps/v1 10 | kind: Deployment 11 | metadata: 12 | name: cuda-sample-vector-add 13 | namespace: gpu-test-workloads 14 | labels: 15 | app: cuda-sample-vector-add 16 | spec: 17 | replicas: 1 18 | selector: 19 | matchLabels: 20 | app: cuda-sample-vector-add 21 | template: 22 | metadata: 23 | labels: 24 | app: cuda-sample-vector-add 25 | spec: 26 | containers: 27 | - name: cuda-sample-vector-add 28 | image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04 29 | command: 30 | - /bin/bash 31 | - '-c' 32 | - '--' 33 | args: 34 | - while true; do /cuda-samples/vectorAdd; done 35 | resources: 36 | limits: 37 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs 38 | nvidia.com/gpumem: 3000 # Each vGPU contains 3000M device memory (Optional,Integer) 39 | terminationMessagePath: /dev/termination-log 40 | terminationMessagePolicy: File 41 | imagePullPolicy: IfNotPresent 42 | restartPolicy: Always 43 | terminationGracePeriodSeconds: 30 44 | dnsPolicy: ClusterFirst 45 | hostPID: true 46 | securityContext: {} 47 | schedulerName: default-scheduler 48 | tolerations: 49 | - key: nvidia.com/gpu 50 | operator: Exists 51 | effect: NoSchedule 52 | priorityClassName: system-cluster-critical 53 | strategy: 54 | type: RollingUpdate 55 | rollingUpdate: 56 | maxUnavailable: 25% 57 | maxSurge: 25% 58 | revisionHistoryLimit: 10 59 | progressDeadlineSeconds: 600 -------------------------------------------------------------------------------- /examples/ascend/job-310P.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: ascend310p-job 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | huawei.com/Ascend310P: 1 # requesting 1 NPU 13 | huawei.com/Ascend310P-memory: 2000 # requesting 2000m device memory
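The Ascend examples that follow use the same pattern as the NVIDIA example.yaml above: a device-count resource plus an optional per-device memory resource. A quick way to check that the HAMi scheduler (rather than the default scheduler) handled such a pod is to inspect the annotations the scheduler patches onto it, as described in design.md and protocol.md earlier in this repo. A minimal sketch, assuming the pod name from job-310P.yaml and a cluster where HAMi and the Ascend device plugin are already installed; the exact annotation keys are device-specific and not confirmed here:

```
kubectl apply -f examples/ascend/job-310P.yaml
kubectl get pod ascend310p-job -o jsonpath='{.metadata.annotations}'
```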
-------------------------------------------------------------------------------- /examples/ascend/job-910A.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: ascend910a-job 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | huawei.com/Ascend910A: 1 # requesting 1 NPU 13 | huawei.com/Ascend910A-memory: 2000 # requesting 2000m device memory -------------------------------------------------------------------------------- /examples/ascend/job-910B2.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: ascend910b2-job 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | huawei.com/Ascend910B2: 1 # requesting 1 NPU 13 | huawei.com/Ascend910B2-memory: 2000 # requesting 2000m device memory -------------------------------------------------------------------------------- /examples/ascend/job-910B3.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: ascend910b-job 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | huawei.com/Ascend910B: 1 # requesting 1 NPU 13 | huawei.com/Ascend910B-memory: 2000 # requesting 2000m device memory -------------------------------------------------------------------------------- /examples/ascend/job-910B4.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: ascend910b4-job 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | huawei.com/Ascend910B4: 1 # requesting 1 NPU 13 | huawei.com/Ascend910B4-memory: 2000 # requesting 2000m device memory -------------------------------------------------------------------------------- /examples/enflame/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gcushare-pod-2 5 | namespace: kube-system 6 | spec: 7 | terminationGracePeriodSeconds: 0 8 | containers: 9 | - name: pod-gcu-example1 10 | image: ubuntu:18.04 11 | imagePullPolicy: IfNotPresent 12 | command: 13 | - sleep 14 | args: 15 | - '100000' 16 | resources: 17 | limits: 18 | enflame.com/vgcu: 1 19 | enflame.com/vgcu-percentage: 22 -------------------------------------------------------------------------------- /examples/enflame/use_exclusive.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gcushare-pod-4 5 | namespace: kube-system 6 | spec: 7 | terminationGracePeriodSeconds: 0 8 | containers: 9 | - name: pod-gcu-example3 10 | image: ubuntu:18.04 11 | imagePullPolicy: IfNotPresent 12 | command: 13 | - sleep 14 | args: 15 | - '100000' 16 | resources: 17 | limits: 18 | enflame.com/vgcu: 1 
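Note that the two Enflame examples above differ only in `enflame.com/vgcu-percentage`: default_use.yaml shares a GCU by requesting 22 percent of one card, while use_exclusive.yaml omits the percentage and therefore requests the whole card. A minimal sketch for inspecting what an Enflame node actually advertises, assuming `<gcu-node>` is a hypothetical placeholder you replace with a real node name:

```
kubectl describe node <gcu-node> | grep -i enflame.com
```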
-------------------------------------------------------------------------------- /examples/hygon/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: alexnet-tf-gpu-pod-mem 5 | labels: 6 | purpose: demo-tf-amdgpu 7 | spec: 8 | containers: 9 | - name: alexnet-tf-gpu-container 10 | image: pytorch:resnet50 11 | workingDir: /root 12 | command: ["sleep","infinity"] 13 | resources: 14 | limits: 15 | hygon.com/dcunum: 1 # requesting a DCU 16 | hygon.com/dcumem: 2000 # each dcu requires 2000 MiB device memory 17 | hygon.com/dcucores: 60 # each dcu uses 60% of total compute cores 18 | -------------------------------------------------------------------------------- /examples/hygon/specify_card_type_not_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: alexnet-tf-gpu-pod-mem 5 | annotations: 6 | hygon.com/nouse-dcutype: "Z100L" # Specify the card types to avoid for this job, use commas to separate; the job will not launch on the specified cards 7 | #In this example, we don't want this container to run on Z100L 8 | purpose: demo-tf-amdgpu 9 | spec: 10 | containers: 11 | - name: alexnet-tf-gpu-container 12 | image: pytorch:resnet50 13 | workingDir: /root 14 | command: ["sleep","infinity"] 15 | resources: 16 | limits: 17 | hygon.com/dcunum: 1 # requesting a DCU 18 | hygon.com/dcumem: 2000 19 | hygon.com/dcucores: 60 20 | -------------------------------------------------------------------------------- /examples/hygon/specify_card_type_to_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: alexnet-tf-gpu-pod-mem 5 | annotations: 6 | hygon.com/use-dcutype: "Z100" # Specify the card types for this job, use commas to separate; the job will not launch on non-specified cards 7 | #In this example, we want to run this job on Z100 8 | labels: 9 | purpose: demo-tf-amdgpu 10 | spec: 11 | containers: 12 | - name: alexnet-tf-gpu-container 13 | image: pytorch:resnet50 14 | workingDir: /root 15 | command: ["sleep","infinity"] 16 | resources: 17 | limits: 18 | hygon.com/dcunum: 1 # requesting a DCU 19 | hygon.com/dcumem: 2000 20 | hygon.com/dcucores: 60 21 | -------------------------------------------------------------------------------- /examples/iluvatar/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: poddemo 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: poddemo 9 | image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e 10 | command: 11 | - bash 12 | args: 13 | - -c 14 | - | 15 | set -ex 16 | echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc 17 | cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ 18 | cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ 19 | source /root/.bashrc 20 | sleep 360000 21 | resources: 22 | requests: 23 | iluvatar.ai/vgpu: 1 24 | iluvatar.ai/vcuda-core: 50 25 | iluvatar.ai/vcuda-memory: 64 26 | limits: 27 | iluvatar.ai/vgpu: 1 28 | iluvatar.ai/vcuda-core: 50 29 | iluvatar.ai/vcuda-memory: 64 -------------------------------------------------------------------------------- /examples/iluvatar/multi-containers.yaml:
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: poddemo 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: poddemo 9 | image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e 10 | command: 11 | - bash 12 | args: 13 | - -c 14 | - | 15 | set -ex 16 | echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc 17 | cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ 18 | cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ 19 | source /root/.bashrc 20 | sleep 360000 21 | resources: 22 | requests: 23 | iluvatar.ai/vgpu: 1 24 | iluvatar.ai/vcuda-core: 50 25 | iluvatar.ai/vcuda-memory: 64 26 | limits: 27 | iluvatar.ai/vgpu: 1 28 | iluvatar.ai/vcuda-core: 50 29 | iluvatar.ai/vcuda-memory: 64 30 | - name: poddemo1 31 | image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e 32 | command: 33 | - bash 34 | args: 35 | - -c 36 | - | 37 | set -ex 38 | echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc 39 | cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ 40 | cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ 41 | source /root/.bashrc 42 | sleep 360000 43 | resources: 44 | requests: 45 | iluvatar.ai/vgpu: 1 46 | iluvatar.ai/vcuda-core: 50 47 | iluvatar.ai/vcuda-memory: 64 48 | limits: 49 | iluvatar.ai/vgpu: 1 50 | iluvatar.ai/vcuda-core: 50 51 | iluvatar.ai/vcuda-memory: 64 -------------------------------------------------------------------------------- /examples/iluvatar/multi-devices.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: poddemo 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: poddemo 9 | image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e 10 | command: 11 | - bash 12 | args: 13 | - -c 14 | - | 15 | set -ex 16 | echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc 17 | cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ 18 | cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ 19 | source /root/.bashrc 20 | sleep 360000 21 | resources: 22 | requests: 23 | iluvatar.ai/vgpu: 2 24 | limits: 25 | iluvatar.ai/vgpu: 2 -------------------------------------------------------------------------------- /examples/metax/gpu/binpack.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod1 5 | annotations: 6 | hami.io/node-scheduler-policy: "binpack" # when this parameter is set to binpack, the scheduler will try to minimize the topology loss. 
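# (Compare with spread.yaml below, which instead tries to find the best topology for a single task; imgs/metax_binpack.png and imgs/metax_spread.png illustrate the two placements.)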
7 | spec: 8 | containers: 9 | - name: ubuntu-container 10 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 11 | imagePullPolicy: IfNotPresent 12 | command: ["sleep","infinity"] 13 | resources: 14 | limits: 15 | metax-tech.com/gpu: 1 # requesting 1 vGPU -------------------------------------------------------------------------------- /examples/metax/gpu/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod1 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 9 | imagePullPolicy: IfNotPresent 10 | command: ["sleep","infinity"] 11 | resources: 12 | limits: 13 | metax-tech.com/gpu: 1 # requesting 1 vGPU -------------------------------------------------------------------------------- /examples/metax/gpu/spread.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod1 5 | annotations: 6 | hami.io/node-scheduler-policy: "spread" # when this parameter is set to spread, the scheduler will try to find the best topology for this task. 7 | spec: 8 | containers: 9 | - name: ubuntu-container 10 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 11 | imagePullPolicy: IfNotPresent 12 | command: ["sleep","infinity"] 13 | resources: 14 | limits: 15 | metax-tech.com/gpu: 1 # requesting 1 vGPU -------------------------------------------------------------------------------- /examples/metax/sgpu/allocate_exclusive.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 9 | imagePullPolicy: IfNotPresent 10 | command: ["sleep","infinity"] 11 | resources: 12 | limits: 13 | metax-tech.com/sgpu: 1 # requesting 1 exclusive GPU -------------------------------------------------------------------------------- /examples/metax/sgpu/allocate_specific_gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | metax-tech.com/use-gpuuuid: "36beae85-c835-6b14-6ab2-02671837a59c" # allocate a specific GPU by UUID 7 | spec: 8 | containers: 9 | - name: ubuntu-container 10 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 11 | imagePullPolicy: IfNotPresent 12 | command: ["sleep","infinity"] 13 | resources: 14 | limits: 15 | metax-tech.com/sgpu: 1 # requesting 1 GPU 16 | metax-tech.com/vcore: 60 # each GPU uses 60% of the total compute cores 17 | metax-tech.com/vmemory: 4 # each GPU requires 4 GiB of device memory -------------------------------------------------------------------------------- /examples/metax/sgpu/allocate_vmemory_MiB.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 9 | imagePullPolicy: IfNotPresent 10 | command: ["sleep","infinity"] 11 | resources: 12 | limits: 13 | metax-tech.com/sgpu: 1 # requesting 1
GPU 14 | metax-tech.com/vcore: 60 # each GPU uses 60% of the total compute cores 15 | metax-tech.com/vmemory: 2048Mi # each GPU requires 2048 MiB of device memory -------------------------------------------------------------------------------- /examples/metax/sgpu/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 9 | imagePullPolicy: IfNotPresent 10 | command: ["sleep","infinity"] 11 | resources: 12 | limits: 13 | metax-tech.com/sgpu: 1 # requesting 1 GPU 14 | metax-tech.com/vcore: 60 # each GPU uses 60% of the total compute cores 15 | metax-tech.com/vmemory: 4 # each GPU requires 4 GiB of device memory -------------------------------------------------------------------------------- /examples/metax/sgpu/multi-containers.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container-1 8 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 9 | imagePullPolicy: IfNotPresent 10 | command: ["sleep","infinity"] 11 | resources: 12 | limits: 13 | metax-tech.com/sgpu: 1 # requesting 1 GPU 14 | metax-tech.com/vcore: 60 # this container uses 60% of its GPU's compute cores 15 | metax-tech.com/vmemory: 4 # this container requires 4 GiB of device memory 16 | - name: ubuntu-container-2 17 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 18 | imagePullPolicy: IfNotPresent 19 | command: ["sleep","infinity"] 20 | resources: 21 | limits: 22 | metax-tech.com/sgpu: 1 # requesting 1 GPU 23 | metax-tech.com/vcore: 30 # this container uses 30% of its GPU's compute cores 24 | metax-tech.com/vmemory: 8 # this container requires 8 GiB of device memory -------------------------------------------------------------------------------- /examples/mlu/allocate_whole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: binpack-1 5 | labels: 6 | app: binpack-1 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: binpack-1 12 | template: 13 | metadata: 14 | labels: 15 | app: binpack-1 16 | spec: 17 | containers: 18 | - name: c-1 19 | image: ubuntu:18.04 20 | command: ["sleep"] 21 | args: ["100000"] 22 | resources: 23 | limits: 24 | cambricon.com/vmlu: "1" # allocates a whole MLU -------------------------------------------------------------------------------- /examples/mlu/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: binpack-1 5 | labels: 6 | app: binpack-1 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: binpack-1 12 | template: 13 | metadata: 14 | labels: 15 | app: binpack-1 16 | spec: 17 | containers: 18 | - name: c-1 19 | image: ubuntu:18.04 20 | command: ["sleep"] 21 | args: ["100000"] 22 | resources: 23 | limits: 24 | cambricon.com/vmlu: "1" 25 | cambricon.com/mlu370.smlu.vmemory: "20" 26 | cambricon.com/mlu370.smlu.vcore: "10" -------------------------------------------------------------------------------- /examples/mthreads/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2
| kind: Pod 3 | metadata: 4 | name: gpushare-pod-default 5 | spec: 6 | restartPolicy: OnFailure 7 | containers: 8 | - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc 9 | imagePullPolicy: IfNotPresent 10 | name: gpushare-pod-1 11 | command: ["sleep"] 12 | args: ["100000"] 13 | resources: 14 | limits: 15 | mthreads.com/vgpu: 1 16 | mthreads.com/sgpu-memory: 32 17 | mthreads.com/sgpu-core: 8 -------------------------------------------------------------------------------- /examples/mthreads/multi_cards.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpushare-pod-multi-cards 5 | spec: 6 | restartPolicy: OnFailure 7 | containers: 8 | - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc 9 | imagePullPolicy: IfNotPresent 10 | name: gpushare-pod-1 11 | command: ["sleep"] 12 | args: ["100000"] 13 | resources: 14 | limits: 15 | mthreads.com/vgpu: 2 -------------------------------------------------------------------------------- /examples/mthreads/use_exclusive.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpushare-pod-exclusive 5 | spec: 6 | restartPolicy: OnFailure 7 | containers: 8 | - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc 9 | imagePullPolicy: IfNotPresent 10 | name: gpushare-pod-1 11 | command: ["sleep"] 12 | args: ["100000"] 13 | resources: 14 | limits: 15 | mthreads.com/vgpu: 1 -------------------------------------------------------------------------------- /examples/nvidia/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:22.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs 13 | nvidia.com/gpumem: 3000 # each physical GPU allocates 3000 MB of device memory to the pod (Optional, Integer) 14 | nvidia.com/gpucores: 30 # each physical GPU allocates 30% of its compute cores to the pod (Optional, Integer) 15 | -------------------------------------------------------------------------------- /examples/nvidia/default_use_legacy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs 13 | -------------------------------------------------------------------------------- /examples/nvidia/dynamic_mig_example.yaml: -------------------------------------------------------------------------------- 1 | ## This example will allocate 2g.10gb * 2 for an A100-40GB-PCIE device 2 | ## or 1g.10gb * 2 for an A100-80GB-SXM device.
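## (In broad terms, HAMi maps the per-GPU gpumem request below onto the smallest MIG template that satisfies it on each device model, which is why the chosen profile differs between the two cards above; see docs/dynamic-mig-support.md for the exact selection rules.)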
3 | apiVersion: v1 4 | kind: Pod 5 | metadata: 6 | name: gpu-pod 7 | annotations: 8 | nvidia.com/vgpu-mode: "mig" 9 | hami.io/gpu-scheduler-policy: "binpack" #(Optional) 10 | spec: 11 | containers: 12 | - name: ubuntu-container 13 | image: ubuntu:18.04 14 | command: ["bash", "-c", "sleep 86400"] 15 | resources: 16 | limits: 17 | nvidia.com/gpu: 2 18 | nvidia.com/gpumem: 8000 19 | 20 | -------------------------------------------------------------------------------- /examples/nvidia/example.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs 13 | #nvidia.com/gpumem: 3000 # each physical GPU allocates 3000 MB of device memory to the pod 14 | nvidia.com/gpumem-percentage: 50 # each physical GPU allocates 50% of its device memory to the pod. Cannot be combined with nvidia.com/gpumem 15 | #nvidia.com/gpucores: 90 # each physical GPU allocates 90% of its compute cores to the pod 16 | #nvidia.com/priority: 0 # there are only two priority classes, 0 (high) and 1 (low); default: 1 17 | # The utilization of a high-priority task won't be limited to resourceCores unless it shares its GPU with other high-priority tasks. 18 | # The utilization of a low-priority task won't be limited to resourceCores if no other task shares its GPU. 19 | - name: ubuntu-container0 20 | image: ubuntu:18.04 21 | command: ["bash", "-c", "sleep 86400"] 22 | - name: ubuntu-container1 23 | image: ubuntu:18.04 24 | command: ["bash", "-c", "sleep 86400"] 25 | resources: 26 | limits: 27 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs 28 | nvidia.com/gpumem: 2000 # each physical GPU allocates 2000 MB of device memory to the pod (Optional, Integer) 29 | #nvidia.com/gpucores: 90 # each physical GPU allocates 90% of its compute cores to the pod 30 | 31 | -------------------------------------------------------------------------------- /examples/nvidia/mig_example.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/mig-3g.20gb: 1 # requesting 1 MIG 3g.20gb instance 13 | -------------------------------------------------------------------------------- /examples/nvidia/specify_card_type_not_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | # You can run: kubectl get node $node -o jsonpath='{.metadata.annotations.hami\.io/node-nvidia-register}' to get the registered GPU info 7 | # The full GPU type name is like NVIDIA-NVIDIA A100, while the short name is like A100 8 | nvidia.com/nouse-gputype: "1080,2080" # Blacklist card types for this job; separate multiple types with commas. The job will not run on the specified card types 9 | # In this example, we don't want our job to run on 1080 (including 1080 Ti) or 2080 (including 2080 Ti) cards.
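# (As the note above implies, type matching is done by substring, which is why the entry "1080" also covers the 1080 Ti variants.)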
10 | spec: 11 | containers: 12 | - name: ubuntu-container 13 | image: ubuntu:18.04 14 | command: ["bash", "-c", "sleep 86400"] 15 | resources: 16 | limits: 17 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs 18 | -------------------------------------------------------------------------------- /examples/nvidia/specify_card_type_to_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | # You can run: kubectl get node $node -o jsonpath='{.metadata.annotations.hami\.io/node-nvidia-register}' to get the registered GPU info 7 | # The full GPU type name is like NVIDIA-NVIDIA A100, while the short name is like A100 8 | nvidia.com/use-gputype: "A100,V100" # Specify the card types for this job; separate multiple types with commas. The job will only run on the specified card types 9 | # In this example, we want to run this job on an A100 or a V100 10 | spec: 11 | containers: 12 | - name: ubuntu-container 13 | image: ubuntu:18.04 14 | command: ["bash", "-c", "sleep 86400"] 15 | resources: 16 | limits: 17 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs 18 | -------------------------------------------------------------------------------- /examples/nvidia/specify_scheduling_policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | hami.io/node-scheduler-policy: "spread" # when this parameter is set to spread, the scheduler will try to place the pod on a different GPU node from other pods, spreading work across nodes. 7 | hami.io/gpu-scheduler-policy: "binpack" # when this parameter is set to binpack, the scheduler will try to pack the pod onto an already-shared GPU card. 8 | spec: 9 | containers: 10 | - name: ubuntu-container 11 | image: ubuntu:18.04 12 | command: ["bash", "-c", "sleep 86400"] 13 | resources: 14 | limits: 15 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs 16 | -------------------------------------------------------------------------------- /examples/nvidia/specify_uuid_not_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | # You can run: kubectl get node $node -o jsonpath='{.metadata.annotations.hami\.io/node-nvidia-register}' to get the registered GPU info, including UUIDs 7 | # A UUID looks like GPU-03f69c50-207a-2038-9b45-23cac89cb67d 8 | nvidia.com/nouse-gpuuuid: "GPU-03f69c50-207a-2038-9b45-23cac89cb67d" # Blacklist card UUIDs for this job; separate multiple UUIDs with commas. The job will not run on the specified cards 9 | # In this example, we don't want the job to run on GPU-03f69c50-207a-2038-9b45-23cac89cb67d.
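# (Tip: besides the node annotation above, running nvidia-smi -L on the node lists every GPU together with its UUID.)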
10 | spec: 11 | containers: 12 | - name: ubuntu-container 13 | image: ubuntu:18.04 14 | command: ["bash", "-c", "sleep 86400"] 15 | resources: 16 | limits: 17 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs -------------------------------------------------------------------------------- /examples/nvidia/specify_uuid_to_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | # You can run: kubectl get node $node -o jsonpath='{.metadata.annotations.hami\.io/node-nvidia-register}' to get the registered GPU info, including UUIDs 7 | # A UUID looks like GPU-03f69c50-207a-2038-9b45-23cac89cb67d 8 | nvidia.com/use-gpuuuid: "GPU-03f69c50-207a-2038-9b45-23cac89cb67d,GPU-03f69c50-207a-2038-9b45-23cac89cb67e" # Specify the card UUIDs for this job, separated by commas. The job will only run on the specified cards 9 | # In this example, we want to run this job on GPU-03f69c50-207a-2038-9b45-23cac89cb67d or GPU-03f69c50-207a-2038-9b45-23cac89cb67e 10 | spec: 11 | containers: 12 | - name: ubuntu-container 13 | image: ubuntu:18.04 14 | command: ["bash", "-c", "sleep 86400"] 15 | resources: 16 | limits: 17 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs 18 | -------------------------------------------------------------------------------- /examples/nvidia/use_as_normal.yaml: -------------------------------------------------------------------------------- 1 | # gpu-pod1 and gpu-pod2 will NOT share the same GPU 2 | apiVersion: v1 3 | kind: Pod 4 | metadata: 5 | name: gpu-pod1 6 | spec: 7 | containers: 8 | - name: ubuntu-container 9 | image: ubuntu:18.04 10 | command: ["bash", "-c", "sleep 86400"] 11 | resources: 12 | limits: 13 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs 14 | --- 15 | apiVersion: v1 16 | kind: Pod 17 | metadata: 18 | name: gpu-pod2 19 | spec: 20 | containers: 21 | - name: ubuntu-container 22 | image: ubuntu:18.04 23 | command: ["bash", "-c", "sleep 86400"] 24 | resources: 25 | limits: 26 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs -------------------------------------------------------------------------------- /examples/nvidia/use_exclusive_card.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod1 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs 13 | nvidia.com/gpumem-percentage: 100 # each physical GPU allocates 100% of its device memory to the pod (Optional, Integer) 14 | nvidia.com/gpucores: 100 # each physical GPU allocates 100% of its compute cores to the pod (Optional, Integer) 15 | -------------------------------------------------------------------------------- /examples/nvidia/use_memory_fraction.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs 13 | nvidia.com/gpumem-percentage: 50 # each physical GPU allocates 50% of its device memory to the pod (Optional, Integer) 14 | nvidia.com/gpucores: 30 # each physical GPU allocates 30% of its compute cores
to the pod (Optional, Integer) 15 | -------------------------------------------------------------------------------- /examples/nvidia/use_sharing_card.yaml: -------------------------------------------------------------------------------- 1 | # gpu-pod1 and gpu-pod2 could share the same GPU 2 | apiVersion: v1 3 | kind: Pod 4 | metadata: 5 | name: gpu-pod1 6 | spec: 7 | containers: 8 | - name: ubuntu-container 9 | image: ubuntu:18.04 10 | command: ["bash", "-c", "sleep 86400"] 11 | resources: 12 | limits: 13 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs 14 | nvidia.com/gpumem-percentage: 40 # each physical GPU allocates 40% of its device memory to the pod (Optional, Integer) 15 | nvidia.com/gpucores: 60 # each physical GPU allocates 60% of its compute cores to the pod (Optional, Integer) 16 | --- 17 | apiVersion: v1 18 | kind: Pod 19 | metadata: 20 | name: gpu-pod2 21 | spec: 22 | containers: 23 | - name: ubuntu-container 24 | image: ubuntu:18.04 25 | command: ["bash", "-c", "sleep 86400"] 26 | resources: 27 | limits: 28 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs 29 | nvidia.com/gpumem-percentage: 60 # each physical GPU allocates 60% of its device memory to the pod (Optional, Integer) 30 | nvidia.com/gpucores: 40 # each physical GPU allocates 40% of its compute cores to the pod (Optional, Integer) 31 | -------------------------------------------------------------------------------- /hack/.import-aliases: -------------------------------------------------------------------------------- 1 | { 2 | "k8s.io/api/admissionregistration/v1": "admissionregistrationv1", 3 | "k8s.io/api/admissionregistration/v1beta1": "admissionregistrationv1beta1", 4 | "k8s.io/api/admission/v1beta1": "admissionv1beta1", 5 | "k8s.io/api/admission/v1": "admissionv1", 6 | "k8s.io/api/apps/v1": "appsv1", 7 | "k8s.io/api/apps/v1beta1": "appsv1beta1", 8 | "k8s.io/api/apps/v1beta2": "appsv1beta2", 9 | "k8s.io/api/authentication/v1": "authenticationv1", 10 | "k8s.io/api/authentication/v1beta1": "authenticationv1beta1", 11 | "k8s.io/api/authorization/v1": "authorizationv1", 12 | "k8s.io/api/authorization/v1beta1": "authorizationv1beta1", 13 | "k8s.io/api/autoscaling/v1": "autoscalingv1", 14 | "k8s.io/api/autoscaling/v2": "autoscalingv2", 15 | "k8s.io/api/batch/v1": "batchv1", 16 | "k8s.io/api/batch/v1beta1": "batchv1beta1", 17 | "k8s.io/api/certificates/v1beta1": "certificatesv1beta1", 18 | "k8s.io/api/coordination/v1": "coordinationv1", 19 | "k8s.io/api/coordination/v1beta1": "coordinationv1beta1", 20 | "k8s.io/api/core/v1": "corev1", 21 | "k8s.io/api/discovery/v1": "discoveryv1", 22 | "k8s.io/api/events/v1": "eventsv1", 23 | "k8s.io/api/events/v1beta1": "eventsv1beta1", 24 | "k8s.io/api/extensions/v1beta1": "extensionsv1beta1", 25 | "k8s.io/api/imagepolicy/v1alpha1": "imagepolicyv1alpha1", 26 | "k8s.io/api/networking/v1": "networkingv1", 27 | "k8s.io/api/networking/v1beta1": "networkingv1beta1", 28 | "k8s.io/api/node/v1alpha1": "nodev1alpha1", 29 | "k8s.io/api/node/v1beta1": "nodev1beta1", 30 | "k8s.io/api/node/v1": "nodev1", 31 | "k8s.io/api/policy/v1": "policyv1", 32 | "k8s.io/api/policy/v1beta1": "policyv1beta1", 33 | "k8s.io/api/rbac/v1": "rbacv1", 34 | "k8s.io/api/rbac/v1alpha1": "rbacv1alpha1", 35 | "k8s.io/api/rbac/v1beta1": "rbacv1beta1", 36 | "k8s.io/api/scheduling/v1": "schedulingv1", 37 | "k8s.io/api/scheduling/v1alpha1": "schedulingv1alpha1", 38 | "k8s.io/api/scheduling/v1beta1": "schedulingv1beta1", 39 | "k8s.io/api/storage/v1": "storagev1", 40
| "k8s.io/api/storage/v1alpha1": "storagev1alpha1", 41 | "k8s.io/api/storage/v1beta1": "storagev1beta1", 42 | "k8s.io/apimachinery/pkg/api/errors": "apierrors", 43 | "k8s.io/apimachinery/pkg/apis/meta/v1": "metav1", 44 | "k8s.io/kubelet/apis/stats/v1alpha1": "kubeletstatsv1alpha1", 45 | "k8s.io/kubelet/pkg/apis/deviceplugin/v1alpha": "kubeletdevicepluginv1alpha", 46 | "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1": "kubeletdevicepluginv1beta1", 47 | "k8s.io/kubelet/pkg/apis/pluginregistration/v1": "kubeletpluginregistrationv1", 48 | "k8s.io/kubelet/pkg/apis/pluginregistration/v1alpha1": "kubeletpluginregistrationv1alpha1", 49 | "k8s.io/kubelet/pkg/apis/pluginregistration/v1beta1": "kubeletpluginregistrationv1beta1", 50 | "k8s.io/kubelet/pkg/apis/podresources/v1alpha1": "kubeletpodresourcesv1alpha1" 51 | } 52 | -------------------------------------------------------------------------------- /hack/boilerplate/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | -------------------------------------------------------------------------------- /hack/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright © 2024 HAMi Authors 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | set -e 18 | [[ -z ${SHORT_VERSION} ]] && SHORT_VERSION=$(git rev-parse --abbrev-ref HEAD) 19 | [[ -z ${COMMIT_CODE} ]] && COMMIT_CODE=$(git describe --abbrev=100 --always) 20 | 21 | export SHORT_VERSION 22 | export COMMIT_CODE 23 | export VERSION="${SHORT_VERSION}-${COMMIT_CODE}" 24 | export LATEST_VERSION="latest" 25 | export GOLANG_IMAGE="golang:1.22.5-bullseye" 26 | export NVIDIA_IMAGE="nvidia/cuda:12.2.0-devel-ubuntu20.04" 27 | export DEST_DIR="/usr/local" 28 | 29 | IMAGE=${IMAGE-"projecthami/hami"} 30 | 31 | function go_build() { 32 | [[ -z "$J" ]] && J=$(nproc | awk '{print int(($0 + 1)/ 2)}') 33 | make -j$J 34 | } 35 | 36 | function docker_build() { 37 | docker build --build-arg VERSION="${VERSION}" --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} --build-arg DEST_DIR=${DEST_DIR} -t "${IMAGE}:${VERSION}" -f docker/Dockerfile . 
38 | docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${SHORT_VERSION}" 39 | docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${LATEST_VERSION}" 40 | } 41 | 42 | function docker_push() { 43 | #docker push "${IMAGE}:${VERSION}" 44 | docker push "${IMAGE}:${SHORT_VERSION}" 45 | docker push "${IMAGE}:${LATEST_VERSION}" 46 | } 47 | 48 | go_build 49 | docker_build 50 | docker_push 51 | -------------------------------------------------------------------------------- /hack/e2e-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o errexit 17 | set -o nounset 18 | set -o pipefail 19 | 20 | set -x 21 | 22 | E2E_TYPE=${1:-"pullrequest"} 23 | KUBE_CONF=${2:-""} 24 | 25 | REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 26 | source "${REPO_ROOT}"/hack/util.sh 27 | 28 | if util::cmd_exist ginkgo; then 29 | echo "Using ginkgo version:" 30 | ginkgo version 31 | else 32 | go install github.com/onsi/ginkgo/v2/ginkgo 33 | go get github.com/onsi/gomega/... 34 | ginkgo version 35 | fi 36 | 37 | 38 | if [ -z "${KUBE_CONF}" ]; then 39 | echo "Error: KUBE_CONF is not set; pass the kubeconfig path as the second argument." 40 | exit 1 41 | fi 42 | 43 | # Run e2e 44 | if [ "${E2E_TYPE}" == "pullrequest" ] || [ "${E2E_TYPE}" == "release" ]; then 45 | ginkgo -v -r --fail-fast ./test/e2e/ --kubeconfig="${KUBE_CONF}" 46 | if [ $? -ne 0 ]; then 47 | echo "Error: ginkgo command failed." 48 | exit 1 49 | fi 50 | else 51 | echo "Invalid E2E Type: ${E2E_TYPE}" 52 | exit 1 53 | fi 54 | -------------------------------------------------------------------------------- /hack/kubeconfig-demo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | clusters: 3 | - cluster: 4 | server: http://localhost:8080 5 | name: local-server 6 | contexts: 7 | - context: 8 | cluster: local-server 9 | namespace: the-right-prefix 10 | user: myself 11 | name: default-context 12 | current-context: default-context 13 | kind: Config 14 | preferences: {} 15 | users: 16 | - name: myself 17 | user: 18 | password: secret 19 | username: admin 20 | -------------------------------------------------------------------------------- /hack/tools/tools.go: -------------------------------------------------------------------------------- 1 | //go:build tools 2 | 3 | /* 4 | Copyright 2024 The HAMi Authors. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package tools 20 | 21 | import ( 22 | _ "golang.org/x/tools/cmd/goimports" 23 | ) 24 | -------------------------------------------------------------------------------- /hack/unit-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o errexit 17 | set -o nounset 18 | set -o pipefail 19 | 20 | set -x 21 | 22 | # init kubeconfig env 23 | kubeconfig_path="${HOME}/.kube" 24 | kubeconfig_file="${kubeconfig_path}/config" 25 | kubeconfig_demo="./hack/kubeconfig-demo.yaml" 26 | 27 | echo "kubeconfig: ${kubeconfig_file}" 28 | 29 | if [ ! -f "$kubeconfig_file" ]; then 30 | echo "Generate fake kubeconfig" 31 | if [ ! -d "${kubeconfig_path}" ]; then 32 | trap 'rm -rf "$kubeconfig_path"' EXIT 33 | mkdir -p "${kubeconfig_path}" 34 | cp ${kubeconfig_demo} "${kubeconfig_file}" 35 | else 36 | trap 'rm -f "$kubeconfig_file"' EXIT 37 | cp ${kubeconfig_demo} "${kubeconfig_file}" 38 | fi 39 | else 40 | echo "Use local kubeconfig" 41 | fi 42 | 43 | mkdir -p ./_output/coverage/ 44 | mergeF="./_output/coverage/merge.out" 45 | rm -f ${mergeF} 46 | cov_file="./_output/coverage/coverage_pkg.txt" 47 | go test $(go list ./pkg/... | grep -v ./pkg/device-plugin/...) -short --race -count=1 -covermode=atomic -coverprofile=${cov_file} 48 | cat $cov_file | grep -v mode: | grep -v pkg/version | grep -v fake | grep -v main.go >>${mergeF} 49 | # merge the filtered profiles into a single coverage.out for go tool cover 50 | echo "mode: atomic" >coverage.out 51 | cat ${mergeF} >>coverage.out 52 | go tool cover -func=coverage.out 53 | -------------------------------------------------------------------------------- /hack/update-generated-api.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright © 2024 HAMi Authors 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | ROOT_DIR=$(dirname "${BASH_SOURCE[0]}")/..
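# Regenerate the Go gRPC bindings from the .proto files under pkg/api; this assumes protoc and the gofast plugin (protoc-gen-gofast) are installed and on PATH.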
19 | protoc -I${ROOT_DIR} --gofast_out=plugins=grpc:${ROOT_DIR} ${ROOT_DIR}/pkg/api/*.proto -------------------------------------------------------------------------------- /hack/verify-all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o errexit 17 | set -o nounset 18 | set -o pipefail 19 | 20 | REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 21 | 22 | # Show progress 23 | set -x 24 | 25 | # Orders are determined by two factors: 26 | # (1) Items with shorter execution time should run first. 27 | # (2) Items more likely to fail should run first. 28 | 29 | bash "$REPO_ROOT/hack/verify-staticcheck.sh" 30 | 31 | bash "$REPO_ROOT/hack/verify-license.sh" 32 | 33 | bash "$REPO_ROOT/hack/verify-import-aliases.sh" 34 | -------------------------------------------------------------------------------- /hack/verify-chart-version.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o errexit 17 | set -o nounset 18 | set -o pipefail 19 | 20 | REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 21 | cd "${REPO_ROOT}" 22 | 23 | source "${REPO_ROOT}"/hack/util.sh 24 | 25 | # install helm 26 | echo -n "Preparing: 'helm' existence check - " 27 | if util::cmd_exist helm; then 28 | echo "passed" 29 | else 30 | echo "installing helm" 31 | util::install_helm 32 | fi 33 | 34 | APP_VERSION=$(helm show chart ./charts/hami | grep '^appVersion' |grep -E '[0-9].*.[0-9]' | awk -F ':' '{print $2}' | tr -d ' ') 35 | VERSION=$(helm show chart ./charts/hami | grep '^version' |grep -E '[0-9].*.[0-9]' | awk -F ':' '{print $2}' | tr -d ' ') 36 | 37 | if [[ ${APP_VERSION} != ${VERSION} ]]; then 38 | echo "AppVersion of HAMi is ${APP_VERSION}, but version is ${VERSION}!" 39 | exit 1 40 | fi 41 | 42 | echo "Both appVersion and version are ${APP_VERSION}." 43 | 44 | -------------------------------------------------------------------------------- /hack/verify-import-aliases.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors.
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o errexit 17 | set -o nounset 18 | set -o pipefail 19 | 20 | SCRIPT_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 21 | cd "${SCRIPT_ROOT}" 22 | ROOT_PATH=$(pwd) 23 | 24 | IMPORT_ALIASES_PATH="${ROOT_PATH}/hack/.import-aliases" 25 | INCLUDE_PATH="(${ROOT_PATH}/cmd|${ROOT_PATH}/pkg)" 26 | 27 | ret=0 28 | # We can't directly install preferredimports by `go install` due to the go.mod issue: 29 | # go install k8s.io/kubernetes/cmd/preferredimports@v1.21.3: k8s.io/kubernetes@v1.21.3 30 | # The go.mod file for the module providing named packages contains one or 31 | # more replace directives. It must not contain directives that would cause 32 | # it to be interpreted differently than if it were the main module. 33 | go run "${ROOT_PATH}/hack/tools/preferredimports/preferredimports.go" -import-aliases "${IMPORT_ALIASES_PATH}" -include-path "${INCLUDE_PATH}" "${ROOT_PATH}" || ret=$? 34 | if [[ $ret -ne 0 ]]; then 35 | echo "!!! Please see hack/.import-aliases for the preferred aliases for imports." >&2 36 | exit 1 37 | fi 38 | echo "Passed import-aliases verification." 39 | -------------------------------------------------------------------------------- /hack/verify-license.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o errexit 17 | set -o nounset 18 | set -o pipefail 19 | set -ex 20 | 21 | REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 
22 | cd "${REPO_ROOT}" 23 | 24 | if [[ "$(which addlicense)" == "" ]]; then 25 | go install github.com/google/addlicense@v1.1.1 26 | fi 27 | ADDLICENSE_BIN=$(which addlicense) 28 | 29 | # verify presence of license headers and exit with non-zero code if missing 30 | missing_license_header_files="$($ADDLICENSE_BIN \ 31 | -check \ 32 | -ignore "benchmarks/**" \ 33 | -ignore "charts/**" \ 34 | -ignore "docs/**" \ 35 | -ignore "docker/**" \ 36 | -ignore "examples/**" \ 37 | -ignore "lib/**" \ 38 | -ignore "libvgpu/**" \ 39 | -ignore "third_party/**" \ 40 | -ignore "vendor/**" \ 41 | -ignore "_output/**" \ 42 | -ignore ".github/**" \ 43 | -ignore "**/*.md" \ 44 | -ignore "**/*.yaml" \ 45 | -ignore "**/*.yml" \ 46 | -ignore "**/*.json" \ 47 | -ignore ".idea/**" \ 48 | .)" || true 49 | 50 | if [[ "$missing_license_header_files" ]]; then 51 | echo "Files with no license header detected:" 52 | echo "$missing_license_header_files" 53 | echo "Please add all missing license headers." 54 | exit 1 55 | fi 56 | 57 | echo "Congratulations! All files have passed the license header check." 58 | -------------------------------------------------------------------------------- /hack/verify-staticcheck.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o errexit 17 | set -o nounset 18 | set -o pipefail 19 | 20 | REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 21 | GOLANGCI_LINT_VER="v2.1.1" 22 | 23 | cd "${REPO_ROOT}" 24 | source "hack/util.sh" 25 | 26 | if util::cmd_exist golangci-lint; then 27 | echo "Using golangci-lint version:" 28 | golangci-lint version 29 | else 30 | echo "Installing golangci-lint ${GOLANGCI_LINT_VER}" 31 | # https://golangci-lint.run/usage/install/#other-ci 32 | curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/HEAD/install.sh | sh -s -- -b $(go env GOPATH)/bin ${GOLANGCI_LINT_VER} 33 | fi 34 | 35 | if golangci-lint run; then 36 | echo 'Congratulations! All Go source files have passed staticcheck.' 37 | else 38 | echo # print one empty line, separate from warning messages. 39 | echo 'Please review the above warnings.' 40 | echo 'If the above warnings do not make sense, feel free to file an issue.'
41 | exit 1 42 | fi 43 | -------------------------------------------------------------------------------- /imgs/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/arch.png -------------------------------------------------------------------------------- /imgs/benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/benchmark.png -------------------------------------------------------------------------------- /imgs/benchmark_inf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/benchmark_inf.png -------------------------------------------------------------------------------- /imgs/benchmark_train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/benchmark_train.png -------------------------------------------------------------------------------- /imgs/cncf-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/cncf-logo.png -------------------------------------------------------------------------------- /imgs/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/example.png -------------------------------------------------------------------------------- /imgs/hami-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/hami-arch.png -------------------------------------------------------------------------------- /imgs/hami-arch.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/hami-arch.pptx -------------------------------------------------------------------------------- /imgs/hami-graph-color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/hami-graph-color.png -------------------------------------------------------------------------------- /imgs/hami-horizontal-colordark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/hami-horizontal-colordark.png -------------------------------------------------------------------------------- /imgs/hami-vgpu-metrics-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/hami-vgpu-metrics-dashboard.png -------------------------------------------------------------------------------- /imgs/hard_limit.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/hard_limit.jpg -------------------------------------------------------------------------------- /imgs/metax_binpack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/metax_binpack.png -------------------------------------------------------------------------------- /imgs/metax_spread.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/metax_spread.png -------------------------------------------------------------------------------- /imgs/metax_topo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/metax_topo.png -------------------------------------------------------------------------------- /imgs/release-process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/release-process.png -------------------------------------------------------------------------------- /lib/nvidia/ld.so.preload: -------------------------------------------------------------------------------- 1 | /usr/local/vgpu/libvgpu.so -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/cdi/api.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package cdi 34 | 35 | // Interface provides the API to the 'cdi' package 36 | // 37 | //go:generate moq -stub -out api_mock.go . 
Interface 38 | type Interface interface { 39 | CreateSpecFile() error 40 | QualifiedName(string, string) string 41 | } 42 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/cdi/factory.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package cdi 34 | 35 | import ( 36 | "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" 37 | 38 | "k8s.io/klog/v2" 39 | ) 40 | 41 | // New is a factory method that creates a CDI handler for creating CDI specs. 42 | func New(opts ...Option) (Interface, error) { 43 | infolib := info.New() 44 | 45 | hasNVML, _ := infolib.HasNvml() 46 | if !hasNVML { 47 | klog.Warning("No valid resources detected, creating a null CDI handler") 48 | return NewNullHandler(), nil 49 | } 50 | 51 | return newHandler(opts...) 52 | } 53 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/cdi/null.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 
31 | */ 32 | 33 | package cdi 34 | 35 | import ( 36 | "k8s.io/klog/v2" 37 | ) 38 | 39 | type null struct{} 40 | 41 | var _ Interface = &null{} 42 | 43 | // NewNullHandler returns an instance of the 'cdi' interface that can 44 | // be used when CDI specs are not required. 45 | func NewNullHandler() Interface { 46 | return &null{} 47 | } 48 | 49 | // CreateSpecFile is a no-op for the null handler. 50 | func (n *null) CreateSpecFile() error { 51 | return nil 52 | } 53 | 54 | // QualifiedName is a no-op for the null handler. An error message is logged 55 | // indicating that it should never be called for the null handler. 56 | func (n *null) QualifiedName(class string, id string) string { 57 | klog.Error("cannot return a qualified CDI device name with the null CDI handler") 58 | return "" 59 | } 60 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/info/version.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package info 34 | 35 | import "strings" 36 | 37 | // version must be set by go build's -X main.version= option in the Makefile. 38 | var version = "unknown" 39 | 40 | // gitCommit will be the hash that the binary was built from 41 | // and will be populated by the Makefile. 42 | var gitCommit = "" 43 | 44 | // GetVersionParts returns the different version components. 45 | func GetVersionParts() []string { 46 | v := []string{version} 47 | 48 | if gitCommit != "" { 49 | v = append(v, "commit: "+gitCommit) 50 | } 51 | 52 | return v 53 | } 54 | 55 | // GetVersionString returns the string representation of the version. 56 | func GetVersionString(more ...string) string { 57 | v := append(GetVersionParts(), more...) 58 | return strings.Join(v, "\n") 59 | } 60 | 61 | // GetVersion returns the version of the binary. 62 | func GetVersion() string { 63 | return version 64 | } 65 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/api.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license.
7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package plugin 34 | 35 | import "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm" 36 | 37 | // Interface defines the API for the plugin package 38 | type Interface interface { 39 | Devices() rm.Devices 40 | Start() error 41 | Stop() error 42 | } 43 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/api.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package manager 34 | 35 | import "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 36 | 37 | // Interface defines the API for the plugin manager package 38 | type Interface interface { 39 | GetPlugins() ([]plugin.Interface, error) 40 | CreateCDISpecFile() error 41 | } 42 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/null.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 
7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package manager 34 | 35 | import ( 36 | "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 37 | ) 38 | 39 | type null struct{} 40 | 41 | // GetPlugins returns an empty set of plugins for the null manager 42 | func (m *null) GetPlugins() ([]plugin.Interface, error) { 43 | return nil, nil 44 | } 45 | 46 | // CreateCDISpecFile is a no-op for the null plugin manager 47 | func (m *null) CreateCDISpecFile() error { 48 | return nil 49 | } 50 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/nvml.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details.
31 | */ 32 | 33 | package manager 34 | 35 | import ( 36 | "fmt" 37 | 38 | "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 39 | "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm" 40 | ) 41 | 42 | type nvmlmanager manager 43 | 44 | // GetPlugins returns the plugins associated with the NVML resources available on the node 45 | func (m *nvmlmanager) GetPlugins() ([]plugin.Interface, error) { 46 | sConfig, mode, err := plugin.LoadNvidiaDevicePluginConfig() 47 | if err != nil { 48 | return nil, fmt.Errorf("failed to load nvidia plugin config: %v", err) 49 | } 50 | 51 | rms, err := rm.NewNVMLResourceManagers(m.nvmllib, m.config) 52 | if err != nil { 53 | return nil, fmt.Errorf("failed to construct NVML resource managers: %v", err) 54 | } 55 | 56 | var plugins []plugin.Interface 57 | for _, r := range rms { 58 | plugins = append(plugins, plugin.NewNvidiaDevicePlugin(m.config, r, m.cdiHandler, m.cdiEnabled, sConfig, mode)) 59 | } 60 | return plugins, nil 61 | } 62 | 63 | // CreateCDISpecFile forwards the request to the CDI handler 64 | func (m *nvmlmanager) CreateCDISpecFile() error { 65 | return m.cdiHandler.CreateSpecFile() 66 | } 67 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/options.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details.
31 | */ 32 | 33 | package manager 34 | 35 | import ( 36 | "github.com/NVIDIA/go-nvlib/pkg/nvml" 37 | 38 | "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/cdi" 39 | "github.com/Project-HAMi/HAMi/pkg/device/nvidia" 40 | ) 41 | 42 | // Option is a function that configures a manager 43 | type Option func(*manager) 44 | 45 | // WithCDIEnabled sets whether CDI is enabled for the manager 46 | func WithCDIEnabled(enabled bool) Option { 47 | return func(m *manager) { 48 | m.cdiEnabled = enabled 49 | } 50 | } 51 | 52 | // WithCDIHandler sets the CDI handler for the manager 53 | func WithCDIHandler(handler cdi.Interface) Option { 54 | return func(m *manager) { 55 | m.cdiHandler = handler 56 | } 57 | } 58 | 59 | // WithNVML sets the NVML handler for the manager 60 | func WithNVML(nvmllib nvml.Interface) Option { 61 | return func(m *manager) { 62 | m.nvmllib = nvmllib 63 | } 64 | } 65 | 66 | // WithFailOnInitError sets whether the manager should fail on initialization errors 67 | func WithFailOnInitError(failOnInitError bool) Option { 68 | return func(m *manager) { 69 | m.failOnInitError = failOnInitError 70 | } 71 | } 72 | 73 | // WithMigStrategy sets the MIG strategy for the manager 74 | func WithMigStrategy(migStrategy string) Option { 75 | return func(m *manager) { 76 | m.migStrategy = migStrategy 77 | } 78 | } 79 | 80 | // WithConfig sets the config reference for the manager 81 | func WithConfig(config *nvidia.DeviceConfig) Option { 82 | return func(m *manager) { 83 | m.config = config 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/tegra.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 
31 | */ 32 | 33 | package manager 34 | 35 | import ( 36 | "fmt" 37 | 38 | "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 39 | "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm" 40 | ) 41 | 42 | type tegramanager manager 43 | 44 | // GetPlugins returns the plugins associated with the Tegra resources available on the node 45 | func (m *tegramanager) GetPlugins() ([]plugin.Interface, error) { 46 | sConfig, mode, err := plugin.LoadNvidiaDevicePluginConfig() 47 | if err != nil { 48 | return nil, fmt.Errorf("failed to load nvidia plugin config: %v", err) 49 | } 50 | 51 | rms, err := rm.NewTegraResourceManagers(m.config) 52 | if err != nil { 53 | return nil, fmt.Errorf("failed to construct Tegra resource managers: %v", err) 54 | } 55 | 56 | var plugins []plugin.Interface 57 | for _, r := range rms { 58 | plugins = append(plugins, plugin.NewNvidiaDevicePlugin(m.config, r, m.cdiHandler, m.cdiEnabled, sConfig, mode)) 59 | } 60 | return plugins, nil 61 | } 62 | 63 | // CreateCDISpecFile is a no-op for the tegra plugin manager 64 | func (m *tegramanager) CreateCDISpecFile() error { 65 | return nil 66 | } 67 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/register_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package plugin 34 | 35 | import "testing" 36 | 37 | func Test_parseNvidiaNumaInfo(t *testing.T) { 38 | 39 | tests := []struct { 40 | name string 41 | idx int 42 | nvidiaTopoStr string 43 | want int 44 | wantErr bool 45 | }{ 46 | { 47 | name: "single Tesla P4 NUMA", 48 | idx: 0, 49 | nvidiaTopoStr: `GPU0 CPU Affinity NUMA Affinity ... 50 | ...`, 51 | want: 0, 52 | wantErr: false, 53 | }, 54 | { 55 | name: "two Tesla P4 NUMA topo with index 0", 56 | idx: 0, 57 | nvidiaTopoStr: `GPU0 GPU1 CPU Affinity NUMA Affinity ... 58 | ...`, 59 | want: 0, 60 | wantErr: false, 61 | }, 62 | { 63 | name: "two Tesla P4 NUMA topo with index 1", 64 | idx: 1, 65 | nvidiaTopoStr: `GPU0 GPU1 CPU Affinity NUMA Affinity ...
66 | ...`, 67 | want: 0, 68 | wantErr: false, 69 | }, 70 | { 71 | name: "NUMA Affinity is empty", 72 | idx: 0, 73 | nvidiaTopoStr: `GPU0 CPU Affinity NUMA Affinity GPU NUMA ID 74 | GPU0 X`, 75 | want: 0, 76 | wantErr: false, 77 | }, 78 | } 79 | 80 | for _, tt := range tests { 81 | t.Run(tt.name, func(t *testing.T) { 82 | got, err := parseNvidiaNumaInfo(tt.idx, tt.nvidiaTopoStr) 83 | if (err != nil) != tt.wantErr { 84 | t.Errorf("parseNvidiaNumaInfo() error = %v, wantErr %v", err, tt.wantErr) 85 | return 86 | } 87 | if got != tt.want { 88 | t.Errorf("parseNvidiaNumaInfo() got = %v, want %v", got, tt.want) 89 | } 90 | }) 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/helper.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package rm 34 | 35 | // int8Slice wraps an []int8 with more functions. 36 | type int8Slice []int8 37 | 38 | // String turns a null-terminated int8Slice into a string 39 | func (s int8Slice) String() string { 40 | var b []byte 41 | for _, c := range s { 42 | if c == 0 { 43 | break 44 | } 45 | b = append(b, byte(c)) 46 | } 47 | return string(b) 48 | } 49 | 50 | // uintPtr returns a *uint from a uint32 51 | func uintPtr(c uint32) *uint { 52 | i := uint(c) 53 | return &i 54 | } 55 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/tegra_devices.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License.
16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package rm 34 | 35 | import ( 36 | "fmt" 37 | 38 | "github.com/Project-HAMi/HAMi/pkg/device/nvidia" 39 | ) 40 | 41 | const ( 42 | tegraDeviceName = "tegra" 43 | ) 44 | 45 | // buildTegraDeviceMap creates a DeviceMap for the tegra devices in the system. 46 | // NOTE: At present only a single tegra device is expected. 47 | func buildTegraDeviceMap(config *nvidia.DeviceConfig) (DeviceMap, error) { 48 | devices := make(DeviceMap) 49 | 50 | name := tegraDeviceName 51 | i := 0 52 | for _, resource := range config.Resources.GPUs { 53 | if resource.Pattern.Matches(name) { 54 | index := fmt.Sprintf("%d", i) 55 | err := devices.setEntry(resource.Name, index, &tegraDevice{}) 56 | if err != nil { 57 | return nil, err 58 | } 59 | i++ 60 | } 61 | 62 | } 63 | return devices, nil 64 | } 65 | 66 | type tegraDevice struct{} 67 | 68 | var _ deviceInfo = (*tegraDevice)(nil) 69 | 70 | // GetUUID returns the UUID of the tegra device. 71 | // TODO: This is currently hardcoded to `tegra` 72 | func (d *tegraDevice) GetUUID() (string, error) { 73 | return tegraDeviceName, nil 74 | } 75 | 76 | // GetPaths returns the paths for a tegra device. 77 | // A tegra device does not have paths associated with it. 78 | func (d *tegraDevice) GetPaths() ([]string, error) { 79 | return nil, nil 80 | } 81 | 82 | // GetNumaNode always returns unsupported for a Tegra device 83 | func (d *tegraDevice) GetNumaNode() (bool, int, error) { 84 | return false, -1, nil 85 | } 86 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/wsl_devices.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details.
31 | */ 32 | 33 | package rm 34 | 35 | type wslDevice nvmlDevice 36 | 37 | var _ deviceInfo = (*wslDevice)(nil) 38 | 39 | // GetUUID returns the UUID of the device 40 | func (d wslDevice) GetUUID() (string, error) { 41 | return nvmlDevice(d).GetUUID() 42 | } 43 | 44 | // GetPaths returns the paths for a WSL device. 45 | func (d wslDevice) GetPaths() ([]string, error) { 46 | return []string{"/dev/dxg"}, nil 47 | } 48 | 49 | // GetNumaNode returns the NUMA node associated with the GPU device 50 | func (d wslDevice) GetNumaNode() (bool, int, error) { 51 | return nvmlDevice(d).GetNumaNode() 52 | } 53 | -------------------------------------------------------------------------------- /pkg/device/ascend/vnpu.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package ascend 18 | 19 | type Template struct { 20 | Name string `yaml:"name"` 21 | Memory int64 `yaml:"memory"` 22 | AICore int32 `yaml:"aiCore,omitempty"` 23 | AICPU int32 `yaml:"aiCPU,omitempty"` 24 | } 25 | 26 | type VNPUConfig struct { 27 | CommonWord string `yaml:"commonWord"` 28 | ChipName string `yaml:"chipName"` 29 | ResourceName string `yaml:"resourceName"` 30 | ResourceMemoryName string `yaml:"resourceMemoryName"` 31 | MemoryAllocatable int64 `yaml:"memoryAllocatable"` 32 | MemoryCapacity int64 `yaml:"memoryCapacity"` 33 | AICore int32 `yaml:"aiCore"` 34 | AICPU int32 `yaml:"aiCPU"` 35 | Templates []Template `yaml:"templates"` 36 | } 37 | -------------------------------------------------------------------------------- /pkg/device/metax/config.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package metax 18 | 19 | import "flag" 20 | 21 | type MetaxConfig struct { 22 | // GPU 23 | ResourceCountName string `yaml:"resourceCountName"` 24 | 25 | // SGPU 26 | ResourceVCountName string `yaml:"resourceVCountName"` 27 | ResourceVMemoryName string `yaml:"resourceVMemoryName"` 28 | ResourceVCoreName string `yaml:"resourceVCoreName"` 29 | } 30 | 31 | func ParseConfig(fs *flag.FlagSet) { 32 | // GPU 33 | fs.StringVar(&MetaxResourceCount, "metax-name", "metax-tech.com/gpu", "metax resource count") 34 | 35 | // SGPU 36 | fs.StringVar(&MetaxResourceNameVCount, "metax-vcount", "metax-tech.com/sgpu", "metax vcount name") 37 | fs.StringVar(&MetaxResourceNameVCore, "metax-vcore", "metax-tech.com/vcore", "metax vcore name") 38 | fs.StringVar(&MetaxResourceNameVMemory, "metax-vmemory", "metax-tech.com/vmemory", "metax vmemory name") 39 | } 40 | -------------------------------------------------------------------------------- /pkg/k8sutil/pod.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package k8sutil 18 | 19 | import ( 20 | "github.com/Project-HAMi/HAMi/pkg/device" 21 | "github.com/Project-HAMi/HAMi/pkg/util" 22 | 23 | corev1 "k8s.io/api/core/v1" 24 | "k8s.io/klog/v2" 25 | ) 26 | 27 | func Resourcereqs(pod *corev1.Pod) (counts util.PodDeviceRequests) { 28 | counts = make(util.PodDeviceRequests, len(pod.Spec.Containers)) 29 | klog.V(4).InfoS("Processing resource requirements", 30 | "pod", klog.KObj(pod), 31 | "containerCount", len(pod.Spec.Containers)) 32 | // Count device requests across all containers and device types 33 | cnt := int32(0) 34 | for i := range pod.Spec.Containers { 35 | devices := device.GetDevices() 36 | counts[i] = make(util.ContainerDeviceRequests) 37 | klog.V(5).InfoS("Processing container resources", 38 | "pod", klog.KObj(pod), 39 | "containerIndex", i, 40 | "containerName", pod.Spec.Containers[i].Name) 41 | for idx, val := range devices { 42 | request := val.GenerateResourceRequests(&pod.Spec.Containers[i]) 43 | if request.Nums > 0 { 44 | cnt += request.Nums 45 | counts[i][idx] = request // reuse the request computed above rather than regenerating it 46 | } 47 | } 48 | } 49 | if cnt == 0 { 50 | klog.V(4).InfoS("No device requests found", "pod", klog.KObj(pod)) 51 | } else { 52 | klog.V(4).InfoS("Resource requirements collected", "pod", klog.KObj(pod), "requests", counts) 53 | } 54 | return counts 55 | } 56 | 57 | func IsPodInTerminatedState(pod *corev1.Pod) bool { 58 | return pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded 59 | } 60 | 61 | func AllContainersCreated(pod *corev1.Pod) bool { 62 | return len(pod.Status.ContainerStatuses) >= len(pod.Spec.Containers) 63 | } 64 | -------------------------------------------------------------------------------- /pkg/oci/runtime.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | // Runtime is an interface for a runtime shim. The Exec method accepts a list 20 | // of command line arguments, and returns an error / nil. 21 | type Runtime interface { 22 | Exec([]string) error 23 | } 24 | -------------------------------------------------------------------------------- /pkg/oci/runtime_exec.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | "syscall" 23 | 24 | log "github.com/sirupsen/logrus" 25 | ) 26 | 27 | // SyscallExecRuntime wraps the path to a binary and defines the semantics for how to exec into it. 28 | // This can be used to wrap an OCI-compliant low-level runtime binary, allowing it to be used through the 29 | // Runtime interface. 30 | type SyscallExecRuntime struct { 31 | logger *log.Logger 32 | path string 33 | // exec is used for testing. This defaults to syscall.Exec 34 | exec func(argv0 string, argv []string, envv []string) error 35 | } 36 | 37 | var _ Runtime = (*SyscallExecRuntime)(nil) 38 | 39 | // NewSyscallExecRuntime creates a SyscallExecRuntime for the specified path with the standard logger. 40 | func NewSyscallExecRuntime(path string) (Runtime, error) { 41 | return NewSyscallExecRuntimeWithLogger(log.StandardLogger(), path) 42 | } 43 | 44 | // NewSyscallExecRuntimeWithLogger creates a SyscallExecRuntime for the specified logger and path. 45 | func NewSyscallExecRuntimeWithLogger(logger *log.Logger, path string) (Runtime, error) { 46 | info, err := os.Stat(path) 47 | if err != nil { 48 | return nil, fmt.Errorf("invalid path '%v': %v", path, err) 49 | } 50 | if info.IsDir() || info.Mode()&0111 == 0 { 51 | return nil, fmt.Errorf("specified path '%v' is not an executable file", path) 52 | } 53 | 54 | shim := SyscallExecRuntime{ 55 | logger: logger, 56 | path: path, 57 | exec: syscall.Exec, 58 | } 59 | 60 | return &shim, nil 61 | } 62 | 63 | // Exec execs into the binary at the path from the SyscallExecRuntime struct, passing it the supplied arguments 64 | // after ensuring that the first argument is the path of the target binary.
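//
// A minimal usage sketch (editor's illustration; the runtime path below is a
// hypothetical example, not something this package mandates):
//
//	rt, err := NewSyscallExecRuntime("/usr/bin/runc")
//	if err != nil {
//		log.Fatal(err)
//	}
//	// Passing os.Args execs the wrapped binary in place of the current
//	// process; Exec substitutes its own path for args[0].
//	if err := rt.Exec(os.Args); err != nil {
//		log.Fatal(err)
//	}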
65 | func (s SyscallExecRuntime) Exec(args []string) error { 66 | runtimeArgs := []string{s.path} 67 | if len(args) > 1 { 68 | runtimeArgs = append(runtimeArgs, args[1:]...) 69 | } 70 | 71 | err := s.exec(s.path, runtimeArgs, os.Environ()) 72 | if err != nil { 73 | return fmt.Errorf("could not exec '%v': %v", s.path, err) 74 | } 75 | 76 | // syscall.Exec is not expected to return. This is an error state regardless of whether 77 | // err is nil or not. 78 | return fmt.Errorf("unexpected return from exec '%v'", s.path) 79 | } 80 | -------------------------------------------------------------------------------- /pkg/oci/runtime_mock.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | // MockExecRuntime wraps a SyscallExecRuntime, intercepting the exec call for testing. 20 | type MockExecRuntime struct { 21 | SyscallExecRuntime 22 | execMock 23 | } 24 | 25 | // WithMockExec wraps a specified SyscallExecRuntime with a mocked exec function for testing. 26 | func WithMockExec(e SyscallExecRuntime, execResult error) *MockExecRuntime { 27 | m := MockExecRuntime{ 28 | SyscallExecRuntime: e, 29 | execMock: execMock{result: execResult}, 30 | } 31 | // override the exec function with the mocked exec function. 32 | m.SyscallExecRuntime.exec = m.execMock.exec 33 | return &m 34 | } 35 | 36 | type execMock struct { 37 | argv0 string 38 | argv []string 39 | envv []string 40 | result error 41 | } 42 | 43 | func (m *execMock) exec(argv0 string, argv []string, envv []string) error { 44 | m.argv0 = argv0 45 | m.argv = argv 46 | m.envv = envv 47 | 48 | return m.result 49 | } 50 | -------------------------------------------------------------------------------- /pkg/oci/spec_mock.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | import ( 20 | oci "github.com/opencontainers/runtime-spec/specs-go" 21 | ) 22 | 23 | // MockSpec provides a simple mock for an OCI spec to be used in testing. 24 | // It also implements the SpecModifier interface.
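//
// A minimal sketch of how the mock might be used in a test (editor's
// illustration; it assumes a SpecModifier that edits the wrapped spec in place):
//
//	spec := NewMockSpec(&oci.Spec{}, nil, nil)
//	err := spec.Modify(func(s *oci.Spec) { s.Version = "1.0.2" })
//	// err is the predefined modify result (nil here), and
//	// spec.MockModify.Callcount has been incremented to 1.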
25 | type MockSpec struct { 26 | *oci.Spec 27 | MockLoad mockFunc 28 | MockFlush mockFunc 29 | MockModify mockFunc 30 | } 31 | 32 | var _ Spec = (*MockSpec)(nil) 33 | 34 | // NewMockSpec constructs a MockSpec to be used in testing as a Spec. 35 | func NewMockSpec(spec *oci.Spec, flushResult error, modifyResult error) *MockSpec { 36 | s := MockSpec{ 37 | Spec: spec, 38 | MockFlush: mockFunc{result: flushResult}, 39 | MockModify: mockFunc{result: modifyResult}, 40 | } 41 | 42 | return &s 43 | } 44 | 45 | // Load invokes the mocked Load function to return the predefined error / result. 46 | func (s *MockSpec) Load() error { 47 | return s.MockLoad.call() 48 | } 49 | 50 | // Flush invokes the mocked Flush function to return the predefined error / result. 51 | func (s *MockSpec) Flush() error { 52 | return s.MockFlush.call() 53 | } 54 | 55 | // Modify applies the specified SpecModifier to the spec and invokes the 56 | // mocked modify function to return the predefined error / result. 57 | func (s *MockSpec) Modify(f SpecModifier) error { 58 | f(s.Spec) 59 | return s.MockModify.call() 60 | } 61 | 62 | type mockFunc struct { 63 | Callcount int 64 | result error 65 | } 66 | 67 | func (m *mockFunc) call() error { 68 | m.Callcount++ 69 | return m.result 70 | } 71 | -------------------------------------------------------------------------------- /pkg/scheduler/config/config.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package config 18 | 19 | import "github.com/Project-HAMi/HAMi/pkg/util" 20 | 21 | var ( 22 | QPS float32 23 | Burst int 24 | Timeout int 25 | HTTPBind string 26 | SchedulerName string 27 | MetricsBindAddress string 28 | 29 | DefaultMem int32 30 | DefaultCores int32 31 | DefaultResourceNum int32 32 | 33 | // NodeSchedulerPolicy configures the node scheduling policy, `binpack` or `spread`; the default is binpack. 34 | NodeSchedulerPolicy = util.NodeSchedulerPolicyBinpack.String() 35 | // GPUSchedulerPolicy configures the GPU scheduling policy, `binpack` or `spread`; the default is spread. 36 | GPUSchedulerPolicy = util.GPUSchedulerPolicySpread.String() 37 | 38 | // NodeLabelSelector filters candidate nodes by label during scheduling. 39 | NodeLabelSelector map[string]string 40 | ) 41 | -------------------------------------------------------------------------------- /pkg/scheduler/policy/constant.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package policy 18 | 19 | type SchedulerPolicyName string 20 | 21 | const ( 22 | // NodeSchedulerPolicyBinpack is the binpack scheduling policy for nodes. 23 | NodeSchedulerPolicyBinpack SchedulerPolicyName = "binpack" 24 | // NodeSchedulerPolicySpread is the spread scheduling policy for nodes. 25 | NodeSchedulerPolicySpread SchedulerPolicyName = "spread" 26 | // GPUSchedulerPolicyBinpack is the binpack scheduling policy for GPUs. 27 | GPUSchedulerPolicyBinpack SchedulerPolicyName = "binpack" 28 | // GPUSchedulerPolicySpread is the spread scheduling policy for GPUs. 29 | GPUSchedulerPolicySpread SchedulerPolicyName = "spread" 30 | ) 31 | 32 | func (s SchedulerPolicyName) String() string { 33 | return string(s) 34 | } 35 | 36 | const ( 37 | // NodeSchedulerPolicyAnnotationKey is the Pod annotation users set to override the default node scheduling policy. 38 | NodeSchedulerPolicyAnnotationKey = "hami.io/node-scheduler-policy" 39 | // GPUSchedulerPolicyAnnotationKey is the Pod annotation users set to override the default GPU scheduling policy. 40 | GPUSchedulerPolicyAnnotationKey = "hami.io/gpu-scheduler-policy" 41 | ) 42 | 43 | const ( 44 | Weight int = 10 // Weight scales the combined usage score produced by ComputeScore. 45 | ) 46 | -------------------------------------------------------------------------------- /pkg/scheduler/policy/gpu_policy.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package policy 18 | 19 | import ( 20 | "github.com/Project-HAMi/HAMi/pkg/util" 21 | 22 | "k8s.io/klog/v2" 23 | ) 24 | 25 | type DeviceListsScore struct { 26 | Device *util.DeviceUsage 27 | // Score records the usage/allocation score of this device 28 | Score float32 29 | } 30 | 31 | type DeviceUsageList struct { 32 | DeviceLists []*DeviceListsScore 33 | Policy string 34 | } 35 | 36 | func (l DeviceUsageList) Len() int { 37 | return len(l.DeviceLists) 38 | } 39 | 40 | func (l DeviceUsageList) Swap(i, j int) { 41 | l.DeviceLists[i], l.DeviceLists[j] = l.DeviceLists[j], l.DeviceLists[i] 42 | } 43 | 44 | func (l DeviceUsageList) Less(i, j int) bool { 45 | if l.Policy == util.GPUSchedulerPolicyBinpack.String() { 46 | if l.DeviceLists[i].Device.Numa == l.DeviceLists[j].Device.Numa { 47 | return l.DeviceLists[i].Score < l.DeviceLists[j].Score 48 | } 49 | return l.DeviceLists[i].Device.Numa > l.DeviceLists[j].Device.Numa 50 | } 51 | // default policy is spread 52 | if l.DeviceLists[i].Device.Numa == l.DeviceLists[j].Device.Numa { 53 | return l.DeviceLists[i].Score > l.DeviceLists[j].Score 54 | } 55 | return l.DeviceLists[i].Device.Numa < l.DeviceLists[j].Device.Numa 56 | } 57 | 58 | func (ds *DeviceListsScore) ComputeScore(requests util.ContainerDeviceRequests) { 59 | request, core, mem := int32(0), int32(0), int32(0) 60 | // All requests here are required to target the same device type 61 | for _, container := range requests { 62 | request += container.Nums 63 | core += container.Coresreq 64 | if container.MemPercentagereq != 0 && container.MemPercentagereq != 101 { 65 | mem += ds.Device.Totalmem * container.MemPercentagereq / 100 // multiply before dividing so integer division does not truncate the percentage to zero 66 | continue 67 | } 68 | mem += container.Memreq 69 | } 70 | klog.V(2).Infof("device %s used %d, usedCores %d, usedMem %d", ds.Device.ID, ds.Device.Used, ds.Device.Usedcores, ds.Device.Usedmem) 71 | 72 | usedScore := float32(request+ds.Device.Used) / float32(ds.Device.Count) 73 | coreScore := float32(core+ds.Device.Usedcores) / float32(ds.Device.Totalcore) 74 | memScore := float32(mem+ds.Device.Usedmem) / float32(ds.Device.Totalmem) 75 | ds.Score = float32(Weight) * (usedScore + coreScore + memScore) 76 | klog.V(2).Infof("device %s computed score is %f", ds.Device.ID, ds.Score) 77 | } 78 | -------------------------------------------------------------------------------- /pkg/util/client/options.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package client 18 | 19 | import ( 20 | "time" 21 | 22 | "k8s.io/client-go/rest" 23 | ) 24 | 25 | // Option defines a function type for client configuration options. 26 | type Option func(*rest.Config) 27 | 28 | // Now we use the default values of the kubernetes client, unless HAMi has specific requirements.
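//
// A minimal sketch of how these options compose (editor's illustration with
// hypothetical values):
//
//	cfg := &rest.Config{}
//	for _, opt := range []Option{WithQPS(50), WithBurst(100), WithDefaults()} {
//		opt(cfg)
//	}
//	// cfg.QPS is 50 and cfg.Burst is 100; WithDefaults only fills in
//	// zero values, so it does not override them.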
29 | const ( 30 | DefaultQPS float32 = rest.DefaultQPS 31 | DefaultBurst int = rest.DefaultBurst 32 | DefaultTimeout int = 0 // seconds; 0 means no timeout, following the default behavior of the kubernetes client. 33 | ) 34 | 35 | // WithQPS sets the QPS for the client. 36 | func WithQPS(qps float32) Option { 37 | return func(c *rest.Config) { 38 | c.QPS = qps 39 | } 40 | } 41 | 42 | // WithBurst sets the burst for the client. 43 | func WithBurst(burst int) Option { 44 | return func(c *rest.Config) { 45 | c.Burst = burst 46 | } 47 | } 48 | 49 | // WithTimeout sets the timeout for the client. 50 | func WithTimeout(timeout int) Option { 51 | return func(c *rest.Config) { 52 | c.Timeout = time.Duration(timeout) * time.Second 53 | } 54 | } 55 | 56 | // WithDefaults sets default values for the client configuration. 57 | func WithDefaults() Option { 58 | return func(c *rest.Config) { 59 | if c.QPS == 0 { 60 | c.QPS = DefaultQPS 61 | } 62 | if c.Burst == 0 { 63 | c.Burst = DefaultBurst 64 | } 65 | if c.Timeout == 0 { 66 | c.Timeout = time.Duration(DefaultTimeout) * time.Second 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /pkg/util/client/testdata/invalid_kubeconfig.yaml: -------------------------------------------------------------------------------- 1 | # testdata/invalid_kubeconfig.yaml 2 | apiVersion: v1 3 | kind: Config 4 | clusters: 5 | - cluster: 6 | # Missing server field or invalid URL 7 | # server: http://invalid-url/ 8 | name: broken-cluster 9 | insecure-skip-tls-verify: true 10 | users: 11 | - name: broken-user 12 | user: 13 | # Invalid or missing token 14 | token: not-a-valid-token 15 | contexts: 16 | - context: 17 | cluster: non-existent-cluster 18 | user: non-existent-user 19 | name: broken-context 20 | # Missing current-context -------------------------------------------------------------------------------- /pkg/util/client/testdata/kubeconfig.yaml: -------------------------------------------------------------------------------- 1 | # testdata/kubeconfig.yaml 2 | apiVersion: v1 3 | kind: Config 4 | clusters: 5 | - cluster: 6 | server: https://example.com 7 | insecure-skip-tls-verify: true 8 | name: example-cluster 9 | users: 10 | - name: example-user 11 | user: 12 | token: my-token-value 13 | contexts: 14 | - context: 15 | cluster: example-cluster 16 | user: example-user 17 | name: example-context 18 | current-context: example-context -------------------------------------------------------------------------------- /pkg/util/flag/flags.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package flag 18 | 19 | import ( 20 | "github.com/spf13/pflag" 21 | "github.com/urfave/cli/v2" 22 | "k8s.io/klog/v2" 23 | ) 24 | 25 | func PrintPFlags(flags *pflag.FlagSet) { 26 | flags.VisitAll(func(flag *pflag.Flag) { 27 | klog.Infof("FLAG: --%s=%q", flag.Name, flag.Value) 28 | }) 29 | } 30 | 31 | func PrintCliFlags(c *cli.Context) { 32 | for _, flag := range c.App.Flags { 33 | names := flag.Names() 34 | for _, name := range names { 35 | value := c.Generic(name) 36 | klog.Infof("FLAG: --%s=%q\n", name, value) 37 | } 38 | 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /pkg/util/flag/flags_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package flag 18 | 19 | import ( 20 | "bytes" 21 | "flag" 22 | "strings" 23 | "testing" 24 | 25 | "github.com/spf13/pflag" 26 | "github.com/urfave/cli/v2" 27 | "k8s.io/klog/v2" 28 | ) 29 | 30 | func TestPrintPFlags(t *testing.T) { 31 | var buf bytes.Buffer 32 | klog.SetOutput(&buf) 33 | klog.LogToStderr(false) 34 | defer klog.LogToStderr(true) 35 | tests := []struct { 36 | name string 37 | flags func() *pflag.FlagSet 38 | expected string 39 | }{ 40 | { 41 | name: "Test with name flags", 42 | flags: func() *pflag.FlagSet { 43 | fs := pflag.NewFlagSet("test", pflag.ContinueOnError) 44 | fs.String("name", "bob", "set name") 45 | return fs 46 | }, 47 | expected: `FLAG: --name="bob"`, 48 | }, 49 | } 50 | 51 | for _, tt := range tests { 52 | t.Run(tt.name, func(t *testing.T) { 53 | buf.Reset() 54 | PrintPFlags(tt.flags()) 55 | if got := buf.String(); !strings.Contains(got, tt.expected) { 56 | t.Errorf("PrintPFlags() = %q, want %q", got, tt.expected) 57 | } 58 | }) 59 | } 60 | } 61 | 62 | func TestPrintCliFlags(t *testing.T) { 63 | var buf bytes.Buffer 64 | klog.SetOutput(&buf) 65 | klog.LogToStderr(false) 66 | defer klog.LogToStderr(true) 67 | 68 | tests := []struct { 69 | name string 70 | cliCtx func() *cli.Context 71 | expected string 72 | }{ 73 | { 74 | name: "Test with name flag", 75 | cliCtx: func() *cli.Context { 76 | app := &cli.App{ 77 | Flags: []cli.Flag{ 78 | &cli.StringFlag{ 79 | Name: "name", 80 | Value: "bob", 81 | Usage: "set user name", 82 | }, 83 | }, 84 | } 85 | flagSet := flag.NewFlagSet("test", flag.ContinueOnError) 86 | flagSet.String("name", "bob", "") 87 | return cli.NewContext(app, flagSet, nil) 88 | }, 89 | expected: `FLAG: --name="bob" 90 | `, 91 | }, 92 | } 93 | 94 | for _, tt := range tests { 95 | t.Run(tt.name, func(t *testing.T) { 96 | buf.Reset() 97 | PrintCliFlags(tt.cliCtx()) 98 | got := buf.String() 99 | if !strings.Contains(got, tt.expected) { 100 | t.Errorf("PrintCliFlags() output = %q, want %q", got, tt.expected) 101 | } 102 | }) 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /pkg/version/version.go: 
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package version 18 | 19 | import ( 20 | "fmt" 21 | 22 | "github.com/spf13/cobra" 23 | ) 24 | 25 | var ( 26 | version string 27 | VersionCmd = &cobra.Command{ 28 | Use: "version", 29 | Short: "print version", 30 | Run: func(cmd *cobra.Command, args []string) { 31 | fmt.Println(Version()) 32 | }, 33 | } 34 | ) 35 | 36 | func Version() string { 37 | return version 38 | } 39 | -------------------------------------------------------------------------------- /pkg/version/version_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package version 18 | 19 | import ( 20 | "bytes" 21 | "io" 22 | "os" 23 | "testing" 24 | 25 | "gotest.tools/v3/assert" 26 | ) 27 | 28 | func TestVersion(t *testing.T) { 29 | version = "v1.0.0.1234567890" 30 | versionWant := "v1.0.0.1234567890\n" 31 | 32 | var out bytes.Buffer 33 | r, w, err := os.Pipe() 34 | if err != nil { 35 | t.Fatalf("os.Pipe() failed: %v", err) 36 | } 37 | defer r.Close() 38 | originalStdout := os.Stdout 39 | defer func() { 40 | os.Stdout = originalStdout 41 | w.Close() 42 | }() 43 | os.Stdout = w 44 | 45 | VersionCmd.Run(nil, nil) 46 | w.Close() 47 | 48 | io.Copy(&out, r) 49 | 50 | versionGet := out.String() 51 | assert.Equal(t, versionWant, versionGet) 52 | } 53 | -------------------------------------------------------------------------------- /test/e2e/node/test_suite_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package e2e 18 | 19 | import ( 20 | "flag" 21 | "testing" 22 | 23 | "github.com/onsi/ginkgo/v2" 24 | "github.com/onsi/gomega" 25 | ) 26 | 27 | func init() { 28 | testing.Init() 29 | flag.Parse() 30 | } 31 | 32 | func TestInit(t *testing.T) { 33 | gomega.RegisterFailHandler(ginkgo.Fail) 34 | ginkgo.RunSpecs(t, "Test node") 35 | } 36 | -------------------------------------------------------------------------------- /test/e2e/pod/test_suite_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package e2e 18 | 19 | import ( 20 | "flag" 21 | "testing" 22 | 23 | "github.com/onsi/ginkgo/v2" 24 | "github.com/onsi/gomega" 25 | ) 26 | 27 | func init() { 28 | testing.Init() 29 | flag.Parse() 30 | } 31 | 32 | func TestInit(t *testing.T) { 33 | gomega.RegisterFailHandler(ginkgo.Fail) 34 | ginkgo.RunSpecs(t, "Test pod") 35 | } 36 | -------------------------------------------------------------------------------- /test/e2e/test_suite_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package e2e 18 | 19 | import ( 20 | "flag" 21 | "testing" 22 | 23 | "github.com/onsi/ginkgo/v2" 24 | "github.com/onsi/gomega" 25 | 26 | "github.com/Project-HAMi/HAMi/test/utils" 27 | ) 28 | 29 | func init() { 30 | testing.Init() 31 | } 32 | 33 | func TestInit(t *testing.T) { 34 | flag.Parse() 35 | utils.DefaultKubeConfigPath() 36 | gomega.RegisterFailHandler(ginkgo.Fail) 37 | ginkgo.RunSpecs(t, "HAMi E2E Test Suite") 38 | } 39 | -------------------------------------------------------------------------------- /test/utils/config.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
--------------------------------------------------------------------------------
/test/utils/config.go:
--------------------------------------------------------------------------------
/*
Copyright 2024 The HAMi Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package utils

// Test fixtures shared by the e2e suites.
const (
    GPUNodeLabelKey      = "gpu"
    GPUNodeLabelValue    = "on"
    GPUExecuteNvidiaSMI  = "nvidia-smi"
    GPUExecuteCudaSample = "/cuda-samples/sample"
    GPUPodMemory         = "300"
    GPUPodMemoryUnit     = "MiB"
    GPUPodCore           = "40"
    GPUNameSpace         = "hami-system"
    GPUNode              = "gpu-master"
    GPUCudaTestPass      = "Test PASSED"
)

// HAMi component names and expected scheduler events.
const (
    HamiScheduler             = "hami-scheduler"
    HamiDevicePlugin          = "hami-device-plugin"
    ErrReasonFilteringFailed  = "FilteringFailed"
    ErrMessageFilteringFailed = "no available node"
    // Reason/message emitted by the default kube-scheduler when no node fits.
    ErrReasonFailedScheduling  = "FailedScheduling"
    ErrMessageFailedScheduling = "0/1 nodes are available"
)

--------------------------------------------------------------------------------
/test/utils/event.go:
--------------------------------------------------------------------------------
/*
Copyright 2024 The HAMi Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package utils

import (
    "context"
    "fmt"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/klog/v2"
)

// GetEvents lists the events in the given namespace that match listOptions.
func GetEvents(clientSet *kubernetes.Clientset, namespace string, listOptions metav1.ListOptions) ([]v1.Event, error) {
    events, err := clientSet.CoreV1().Events(namespace).List(context.TODO(), listOptions)
    if err != nil {
        return nil, err
    }

    return events.Items, nil
}

// GetPodEvents returns all events whose involved object is the named pod.
func GetPodEvents(clientSet *kubernetes.Clientset, namespace, podName string) ([]v1.Event, error) {
    listOption := metav1.ListOptions{
        FieldSelector: fmt.Sprintf("involvedObject.kind=Pod,involvedObject.name=%s", podName),
    }

    events, err := GetEvents(clientSet, namespace, listOption)
    if err != nil {
        klog.Errorf("Failed to list events for pod %s in namespace %s: %v", podName, namespace, err)
        return nil, err
    }

    return events, nil
}
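
Putting the two files above together: a spec can fetch a pod's events with GetPodEvents and compare them against the reason/message constants. The helper below is a hypothetical composition (its name and surrounding flow are assumptions, not repository code):

package e2e

import (
    "github.com/onsi/gomega"
    "k8s.io/client-go/kubernetes"

    "github.com/Project-HAMi/HAMi/test/utils"
)

// expectFilteringFailed asserts that the named pod carries a FilteringFailed
// event with the scheduler's "no available node" message.
func expectFilteringFailed(clientSet *kubernetes.Clientset, podName string) {
    events, err := utils.GetPodEvents(clientSet, utils.GPUNameSpace, podName)
    gomega.Expect(err).NotTo(gomega.HaveOccurred())

    found := false
    for _, e := range events {
        if e.Reason == utils.ErrReasonFilteringFailed {
            gomega.Expect(e.Message).To(gomega.ContainSubstring(utils.ErrMessageFilteringFailed))
            found = true
        }
    }
    gomega.Expect(found).To(gomega.BeTrue())
}
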
--------------------------------------------------------------------------------
/test/utils/node.go:
--------------------------------------------------------------------------------
/*
Copyright 2024 The HAMi Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package utils

import (
    "context"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/klog/v2"
)

// GetNodes lists every node in the cluster.
func GetNodes(clientSet *kubernetes.Clientset) (*v1.NodeList, error) {
    nodes, err := clientSet.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
    if err != nil {
        klog.Errorf("Failed to get nodes: %v", err)
        return nil, err
    }

    return nodes, nil
}

// UpdateNode writes the given node object back to the cluster.
func UpdateNode(clientSet *kubernetes.Clientset, node *v1.Node) (*v1.Node, error) {
    updatedNode, err := clientSet.CoreV1().Nodes().Update(context.TODO(), node, metav1.UpdateOptions{})
    if err != nil {
        klog.Errorf("Failed to update node %s: %v", node.Name, err)
        return nil, err
    }

    return updatedNode, nil
}

// AddNodeLabel sets labelKey=labelValue on the named node and returns the updated node.
func AddNodeLabel(clientSet *kubernetes.Clientset, nodeName, labelKey, labelValue string) (*v1.Node, error) {
    node, err := clientSet.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
    if err != nil {
        return nil, err
    }

    if node.Labels == nil {
        node.Labels = make(map[string]string)
    }
    node.Labels[labelKey] = labelValue

    return UpdateNode(clientSet, node)
}

// RemoveNodeLabel deletes labelKey from the named node, if present.
func RemoveNodeLabel(clientSet *kubernetes.Clientset, nodeName, labelKey string) (*v1.Node, error) {
    node, err := clientSet.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
    if err != nil {
        return nil, err
    }

    if node.Labels != nil {
        delete(node.Labels, labelKey)
    }

    return UpdateNode(clientSet, node)
}

--------------------------------------------------------------------------------
/version.mk:
--------------------------------------------------------------------------------
GO=go
GO111MODULE=on
CMDS=scheduler vGPUmonitor
DEVICES=nvidia
OUTPUT_DIR=bin
TARGET_ARCH=amd64
GOLANG_IMAGE=golang:1.22.5-bullseye
NVIDIA_IMAGE=nvidia/cuda:12.3.2-devel-ubuntu20.04
DEST_DIR=/usr/local/vgpu/

VERSION = v0.0.1
IMG_NAME = hami
IMG_TAG = "${IMG_NAME}:${VERSION}"
--------------------------------------------------------------------------------
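
Finally, the node-label helpers above are what let the suites steer scheduling onto the designated GPU node. A sketch of per-spec setup and teardown using them; the clientSet initialisation is assumed to happen elsewhere in the suite and is a placeholder, not repository code:

package e2e

import (
    "github.com/onsi/ginkgo/v2"
    "github.com/onsi/gomega"
    "k8s.io/client-go/kubernetes"

    "github.com/Project-HAMi/HAMi/test/utils"
)

// clientSet is assumed to be initialised during suite setup.
var clientSet *kubernetes.Clientset

var _ = ginkgo.BeforeEach(func() {
    // Mark the GPU node (gpu=on) so GPU specs can rely on it.
    _, err := utils.AddNodeLabel(clientSet, utils.GPUNode, utils.GPUNodeLabelKey, utils.GPUNodeLabelValue)
    gomega.Expect(err).NotTo(gomega.HaveOccurred())
})

var _ = ginkgo.AfterEach(func() {
    // Remove the label so later specs start from a clean node.
    _, err := utils.RemoveNodeLabel(clientSet, utils.GPUNode, utils.GPUNodeLabelKey)
    gomega.Expect(err).NotTo(gomega.HaveOccurred())
})
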