├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.md │ ├── config.yml │ ├── enhancement.md │ ├── good-first.md │ └── question.md ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml ├── labeler.yml ├── release.yml └── workflows │ ├── auto-label-pr.yaml │ ├── auto-release.yaml │ ├── call-e2e-upgrade.yaml │ ├── call-e2e.yaml │ ├── call-release-helm.yaml │ ├── call-release-image-hamicore.yaml │ ├── call-release-image.yaml │ ├── call-release-notes.yaml │ ├── call-release-website.yaml │ ├── ci-image-scanning.yaml │ ├── ci.yaml │ ├── codeql-analysis.yml │ ├── lint-chart.yaml │ └── test-self-hosted.yaml ├── .gitignore ├── .gitmodules ├── .golangci.yaml ├── .trivyignore ├── AUTHORS.md ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DEPENDENCY.md ├── HAMi.jpg ├── LICENSE ├── MAINTAINERS.md ├── Makefile ├── Makefile.defs ├── NOTICE.txt ├── OWNERS ├── README.md ├── README_cn.md ├── SECURITY.md ├── VERSION ├── benchmarks ├── README.md ├── ai-benchmark │ ├── Dockerfile │ └── build.sh └── deployments │ ├── job-on-hami.yml │ └── job-on-nvidia-device-plugin.yml ├── charts ├── Makefile └── hami │ ├── Chart.yaml │ ├── templates │ ├── NOTES.txt │ ├── _helpers.tpl │ ├── device-plugin │ │ ├── configmap.yaml │ │ ├── daemonsetnvidia.yaml │ │ ├── monitorrole.yaml │ │ ├── monitorrolebinding.yaml │ │ ├── monitorservice.yaml │ │ ├── monitorserviceaccount.yaml │ │ └── runtime-class.yaml │ └── scheduler │ │ ├── certmanager.yaml │ │ ├── configmap.yaml │ │ ├── configmapnew.yaml │ │ ├── deployment.yaml │ │ ├── device-configmap.yaml │ │ ├── job-patch │ │ ├── clusterrole.yaml │ │ ├── clusterrolebinding.yaml │ │ ├── job-createSecret.yaml │ │ ├── job-patchWebhook.yaml │ │ ├── psp.yaml │ │ ├── role.yaml │ │ ├── rolebinding.yaml │ │ └── serviceaccount.yaml │ │ ├── rolebinding.yaml │ │ ├── service.yaml │ │ ├── serviceaccount.yaml │ │ └── webhook.yaml │ └── values.yaml ├── cmd ├── device-plugin │ └── nvidia │ │ ├── main.go │ │ ├── plugin-manager.go │ │ ├── vgpucfg.go │ │ └── watchers.go ├── scheduler │ ├── main.go │ └── metrics.go └── vGPUmonitor │ ├── build.sh │ ├── feedback.go │ ├── main.go │ ├── metrics.go │ ├── noderpc │ ├── noderpc.pb.go │ ├── noderpc.proto │ └── noderpc_grpc.pb.go │ ├── testcollector │ └── main.go │ └── validation.go ├── docker ├── Dockerfile ├── Dockerfile.hamicore ├── Dockerfile.hamimaster ├── Dockerfile.withlib ├── entrypoint.sh └── vgpu-init.sh ├── docs ├── CHANGELOG │ └── CHANGELOG-0.0.0.md ├── ascend910b-support.md ├── ascend910b-support_cn.md ├── benchmark.md ├── benchmark_cn.md ├── cambricon-mlu-support.md ├── cambricon-mlu-support_cn.md ├── config.md ├── config_cn.md ├── dashboard.md ├── dashboard_cn.md ├── develop │ ├── design.md │ ├── dynamic-mig.md │ ├── imgs │ │ ├── flowchart.jpeg │ │ ├── gpu-scheduler-policy-demo.png │ │ ├── hami-dynamic-mig-procedure.png │ │ ├── hami-dynamic-mig-structure.png │ │ ├── node-shceduler-policy-demo.png │ │ ├── offline_validation.png │ │ ├── protocol_pod.png │ │ ├── protocol_register.png │ │ └── scheduler-policy-story.png │ ├── protocol.md │ ├── roadmap.md │ ├── scheduler-policy.md │ └── tasklist.md ├── dynamic-mig-support.md ├── dynamic-mig-support_cn.md ├── enflame-gcu-suport.md ├── enflame-gcu-support_cn.md ├── gpu-dashboard.json ├── how-to-profiling-scheduler.md ├── how-to-profiling-scheduler_cn.md ├── how-to-use-volcano-vgpu.md ├── hygon-dcu-support.md ├── hygon-dcu-support_cn.md ├── iluvatar-gpu-support.md ├── iluvatar-gpu-support_cn.md ├── metax-support.md ├── metax-support_cn.md ├── mind-map │ ├── HAMI-VGPU-mind-map-Chinese.png │ ├── 
HAMI-VGPU-mind-map-Chinese.xmind │ ├── HAMI-VGPU-mind-map-English.png │ ├── HAMI-VGPU-mind-map-English.xmind │ └── readme ├── mthreads-support.md ├── mthreads-support_cn.md ├── offline-install.md ├── proposals │ ├── e2e_test.md │ ├── e2e_test.png │ ├── gpu-topo-policy.md │ ├── gpu_utilization.png │ └── gpu_utilization_cn.md ├── release-process.md └── scheduler-event-log.md ├── example.yaml ├── examples ├── ascend │ ├── job-310P.yaml │ ├── job-910A.yaml │ ├── job-910B2.yaml │ ├── job-910B3.yaml │ └── job-910B4.yaml ├── enflame │ ├── default_use.yaml │ └── use_exclusive.yaml ├── hygon │ ├── default_use.yaml │ ├── specify_card_type_not_use.yaml │ └── specify_card_type_to_use.yaml ├── iluvatar │ ├── default_use.yaml │ ├── multi-containers.yaml │ └── multi-devices.yaml ├── metax │ ├── gpu │ │ ├── binpack.yaml │ │ ├── default_use.yaml │ │ └── spread.yaml │ └── sgpu │ │ ├── allocate_exclusive.yaml │ │ ├── allocate_specific_gpu.yaml │ │ ├── allocate_vmemory_MiB.yaml │ │ ├── default_use.yaml │ │ └── multi-containers.yaml ├── mlu │ ├── allocate_whole.yaml │ └── default_use.yaml ├── mthreads │ ├── default_use.yaml │ ├── multi_cards.yaml │ └── use_exclusive.yaml └── nvidia │ ├── default_use.yaml │ ├── default_use_legacy.yaml │ ├── dynamic_mig_example.yaml │ ├── example.yaml │ ├── mig_example.yaml │ ├── specify_card_type_not_use.yaml │ ├── specify_card_type_to_use.yaml │ ├── specify_scheduling_policy.yaml │ ├── specify_uuid_not_use.yaml │ ├── specify_uuid_to_use.yaml │ ├── use_as_normal.yaml │ ├── use_exclusive_card.yaml │ ├── use_memory_fraction.yaml │ └── use_sharing_card.yaml ├── go.mod ├── go.sum ├── hack ├── .import-aliases ├── boilerplate │ └── boilerplate.go.txt ├── build.sh ├── deploy-helm.sh ├── e2e-test-setup.sh ├── e2e-test.sh ├── kubeconfig-demo.yaml ├── tools │ ├── preferredimports │ │ └── preferredimports.go │ └── tools.go ├── unit-test.sh ├── update-generated-api.sh ├── util.sh ├── verify-all.sh ├── verify-chart-version.sh ├── verify-import-aliases.sh ├── verify-license.sh └── verify-staticcheck.sh ├── imgs ├── arch.png ├── benchmark.png ├── benchmark_inf.png ├── benchmark_train.png ├── cncf-logo.png ├── example.png ├── hami-arch.png ├── hami-arch.pptx ├── hami-graph-color.png ├── hami-horizontal-colordark.png ├── hami-vgpu-metrics-dashboard.png ├── hard_limit.jpg ├── metax_binpack.png ├── metax_spread.png ├── metax_topo.png └── release-process.png ├── lib └── nvidia │ └── ld.so.preload ├── pkg ├── device-plugin │ └── nvidiadevice │ │ └── nvinternal │ │ ├── cdi │ │ ├── api.go │ │ ├── api_mock.go │ │ ├── cdi.go │ │ ├── factory.go │ │ ├── null.go │ │ └── options.go │ │ ├── info │ │ └── version.go │ │ ├── mig │ │ └── mig.go │ │ ├── plugin │ │ ├── api.go │ │ ├── manager │ │ │ ├── api.go │ │ │ ├── factory.go │ │ │ ├── null.go │ │ │ ├── nvml.go │ │ │ ├── options.go │ │ │ └── tegra.go │ │ ├── register.go │ │ ├── register_test.go │ │ ├── server.go │ │ ├── server_test.go │ │ ├── util.go │ │ └── util_test.go │ │ └── rm │ │ ├── allocate.go │ │ ├── device_map.go │ │ ├── device_map_test.go │ │ ├── devices.go │ │ ├── health.go │ │ ├── health_test.go │ │ ├── helper.go │ │ ├── nvml_devices.go │ │ ├── nvml_devices_test.go │ │ ├── nvml_manager.go │ │ ├── rm.go │ │ ├── tegra_devices.go │ │ ├── tegra_manager.go │ │ └── wsl_devices.go ├── device │ ├── ascend │ │ ├── device.go │ │ ├── device_test.go │ │ └── vnpu.go │ ├── cambricon │ │ ├── device.go │ │ └── device_test.go │ ├── devices.go │ ├── devices_test.go │ ├── enflame │ │ ├── device.go │ │ └── device_test.go │ ├── hygon │ │ ├── device.go │ │ └── 
device_test.go │ ├── iluvatar │ │ ├── device.go │ │ └── device_test.go │ ├── metax │ │ ├── config.go │ │ ├── device.go │ │ ├── device_test.go │ │ ├── protocol.go │ │ ├── protocol_test.go │ │ ├── sdevice.go │ │ └── sdevice_test.go │ ├── mthreads │ │ ├── device.go │ │ └── device_test.go │ └── nvidia │ │ ├── calculate_score.go │ │ ├── calculate_score_test.go │ │ ├── device.go │ │ ├── device_test.go │ │ └── links.go ├── k8sutil │ ├── pod.go │ └── pod_test.go ├── monitor │ └── nvidia │ │ ├── cudevshr.go │ │ ├── v0 │ │ ├── spec.go │ │ └── spec_test.go │ │ └── v1 │ │ ├── spec.go │ │ └── spec_test.go ├── oci │ ├── runtime.go │ ├── runtime_exec.go │ ├── runtime_exec_test.go │ ├── runtime_mock.go │ ├── spec.go │ └── spec_mock.go ├── scheduler │ ├── config │ │ └── config.go │ ├── event.go │ ├── event_test.go │ ├── nodes.go │ ├── nodes_test.go │ ├── pod_test.go │ ├── pods.go │ ├── policy │ │ ├── constant.go │ │ ├── gpu_policy.go │ │ ├── gpu_policy_test.go │ │ ├── node_policy.go │ │ └── node_policy_test.go │ ├── routes │ │ └── route.go │ ├── scheduler.go │ ├── scheduler_test.go │ ├── score.go │ ├── score_test.go │ ├── webhook.go │ └── webhook_test.go ├── util │ ├── client │ │ ├── client.go │ │ ├── client_test.go │ │ ├── options.go │ │ └── testdata │ │ │ ├── invalid_kubeconfig.yaml │ │ │ └── kubeconfig.yaml │ ├── flag │ │ ├── flags.go │ │ └── flags_test.go │ ├── nodelock │ │ ├── nodelock.go │ │ └── nodelock_test.go │ ├── types.go │ ├── util.go │ └── util_test.go └── version │ ├── version.go │ └── version_test.go ├── test ├── e2e │ ├── node │ │ ├── test_node.go │ │ └── test_suite_test.go │ ├── pod │ │ ├── test_pod.go │ │ └── test_suite_test.go │ └── test_suite_test.go └── utils │ ├── common.go │ ├── config.go │ ├── event.go │ ├── node.go │ └── pod.go └── version.mk /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Report a bug encountered while using HAMi. 4 | labels: kind/bug 5 | 6 | --- 7 | 8 | 10 | 11 | **What happened**: 12 | 13 | **What you expected to happen**: 14 | 15 | **How to reproduce it (as minimally and precisely as possible)**: 16 | 17 | **Anything else we need to know?**: 18 | 19 | - The output of `nvidia-smi -a` on your host 20 | - Your docker or containerd configuration file (e.g: `/etc/docker/daemon.json`) 21 | - The hami-device-plugin container logs 22 | - The hami-scheduler container logs 23 | - The kubelet logs on the node (e.g: `sudo journalctl -r -u kubelet`) 24 | - Any relevant kernel output lines from `dmesg` 25 | 26 | **Environment**: 27 | - HAMi version: 28 | - nvidia driver or other AI device driver version: 29 | - Docker version from `docker version` 30 | - Docker command, image and tag used 31 | - Kernel version from `uname -a` 32 | - Others: 33 | 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | contact_links: 2 | - name: FAQ 3 | url: https://github.com/Project-HAMi/HAMi/issues/646 4 | about: Frequently asked questions and common solutions. 
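A note on the file above: `contact_links` is only part of what GitHub accepts in `.github/ISSUE_TEMPLATE/config.yml`. A minimal sketch (illustrative only, not this repository's actual file) that additionally disables blank issues, so authors must pick one of the templates that follow:

```yaml
# Illustrative sketch of .github/ISSUE_TEMPLATE/config.yml — not the repo's actual file
blank_issues_enabled: false   # require issue authors to choose one of the templates
contact_links:
  - name: FAQ
    url: https://github.com/Project-HAMi/HAMi/issues/646
    about: Frequently asked questions and common solutions.
```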
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Enhancement Request 3 | about: Suggest an enhancement to the project 4 | labels: kind/feature 5 | 6 | --- 7 | 8 | 9 | **What would you like to be added**: 10 | 11 | **What type of PR is this?** 12 | 13 | /kind feature 14 | 15 | **What this PR does / why we need it**: 16 | 17 | **Which issue(s) this PR fixes**: 18 | Fixes # 19 | 20 | **Special notes for your reviewer**: 21 | 22 | **Does this PR introduce a user-facing change?**: -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/good-first.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Good First Issue 3 | about: Publish a good first issue 4 | labels: good first issue 5 | 6 | --- 7 | 8 | 10 | 11 | **Task description**: 12 | 13 | **Solution**: 14 | 15 | **Who can join or take the task**: 16 | 17 | The good first issue is intended for `first-time contributors` to get started on their contributor journey. 18 | 19 | After a contributor has successfully completed 1-2 good first issues, 20 | they should be ready to move on to `help wanted` items, saving the remaining `good first issue` for other new contributors. 21 | 22 | **How to join or take the task**: 23 | 24 | Just reply on the issue with the message `/assign` on a separate line. 25 | 26 | Then, the issue will be assigned to you. 27 | 28 | **How to ask for help**: 29 | 30 | If you need help or have questions, please feel free to ask on this issue. 31 | The issue author or other members of the community will guide you through the contribution process. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Question relating to HAMi. 4 | labels: kind/question 5 | 6 | --- 7 | 8 | **Please provide an in-depth description of the question you have**: 9 | 10 | **What do you think about this question?**: 11 | 12 | **Environment**: 13 | - HAMi version: 14 | - Kubernetes version: 15 | - Others: -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | **What type of PR is this?** 2 | 3 | 14 | 15 | **What this PR does / why we need it**: 16 | 17 | **Which issue(s) this PR fixes**: 18 | Fixes # 19 | 20 | **Special notes for your reviewer**: 21 | 22 | **Does this PR introduce a user-facing change?**: -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | 2 | # To get started with Dependabot version updates, you'll need to specify which 3 | # package ecosystems to update and where the package manifests are located. 
4 | # Please see the documentation for all configuration options: 5 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 6 | 7 | 8 | version: 2 9 | updates: 10 | - package-ecosystem: "gomod" 11 | directory: "/" 12 | schedule: 13 | interval: "daily" 14 | - package-ecosystem: "docker" 15 | directory: "/docker" 16 | schedule: 17 | interval: "daily" 18 | - package-ecosystem: "github-actions" 19 | directory: "/" 20 | schedule: 21 | interval: "daily" 22 | -------------------------------------------------------------------------------- /.github/labeler.yml: -------------------------------------------------------------------------------- 1 | "kind/bug": 2 | - '^[Ff]ix(\(.*\))?:?.*' 3 | "kind/cleanup": 4 | - '^[Cc]hore(\(.*\))?:?.*' 5 | "kind/documentation": 6 | - '^[Dd]ocs?(\(.*\))?:?.*' 7 | "kind/enhancement": 8 | - '^[Rr]efactor(\(.*\))?:?.*' 9 | "kind/feature": 10 | - '^[Ff]eat(\(.*\))?:?.*' 11 | -------------------------------------------------------------------------------- /.github/release.yml: -------------------------------------------------------------------------------- 1 | # .github/release.yml 2 | changelog: 3 | exclude: 4 | labels: 5 | - ignore-for-release 6 | - github-actions 7 | authors: 8 | - dependabot[bot] 9 | categories: 10 | - title: ✨ New Features 11 | labels: 12 | - feature 13 | - design 14 | - enhancement 15 | - title: 🐛 Bug Fixes 16 | labels: 17 | - bug 18 | - title: 📚 Documentation 19 | labels: 20 | - documentation 21 | - title: ⬆️ Dependencies 22 | labels: 23 | - dependencies 24 | - title: 💥 Breaking Changes 25 | labels: 26 | - breaking-change 27 | - title: 🔨 Other Changes 28 | labels: 29 | - "*" 30 | -------------------------------------------------------------------------------- /.github/workflows/auto-label-pr.yaml: -------------------------------------------------------------------------------- 1 | name: "PR Labeler" 2 | on: 3 | pull_request_target: 4 | types: [opened, edited] 5 | 6 | permissions: 7 | issues: write 8 | pull-requests: write 9 | contents: read 10 | 11 | jobs: 12 | labeling: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: github/issue-labeler@v3.4 16 | with: 17 | configuration-path: .github/labeler.yml 18 | enable-versioned-regex: 0 19 | sync-labels: 1 20 | include-title: 1 21 | include-body: 0 22 | repo-token: ${{ github.token }} 23 | -------------------------------------------------------------------------------- /.github/workflows/call-e2e-upgrade.yaml: -------------------------------------------------------------------------------- 1 | name: Call e2e upgrade test 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | ref: 7 | required: true 8 | type: string 9 | permissions: write-all 10 | 11 | jobs: 12 | upgrade-e2e: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: e2e upgrade test 16 | # https://github.com/actions/virtual-environments/issues/709 17 | run: | 18 | echo "Need to add e2e upgrade test" 19 | -------------------------------------------------------------------------------- /.github/workflows/call-e2e.yaml: -------------------------------------------------------------------------------- 1 | name: Call e2e test 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | ref: 7 | description: 'Reference id to run tests' 8 | required: true 9 | type: string 10 | type: 11 | description: 'E2E type' 12 | required: true 13 | type: string 14 | default: pullrequest 15 | 16 | jobs: 17 | e2e-test: 18 | strategy: 19 | matrix: 20 | include: 21 | - device: nvidia 22 | type: tesla-p4 23 | # - device: nvidia 24 | # 
type: rtx-4090 25 | # - device: huawei 26 | # type: ascend-910b 27 | runs-on: [ "${{ matrix.device }}", "${{ matrix.type }}" ] 28 | environment: ${{ matrix.device }} 29 | env: 30 | E2E_TYPE: ${{ inputs.type }} 31 | HAMI_VERSION: ${{ inputs.ref }} 32 | steps: 33 | - name: checkout code 34 | uses: actions/checkout@v4 35 | 36 | - name: install Go 37 | uses: actions/setup-go@v5 38 | with: 39 | go-version: "1.21" 40 | 41 | - name: setup e2e env 42 | run: | 43 | make e2e-env-setup 44 | 45 | - name: download hami helm 46 | if: inputs.type == 'pullrequest' 47 | uses: actions/download-artifact@v4 48 | with: 49 | name: chart_package_artifact 50 | path: charts/ 51 | 52 | - name: download hami image 53 | if: inputs.type == 'pullrequest' 54 | uses: actions/download-artifact@v4 55 | with: 56 | name: hami-image 57 | path: ./image 58 | 59 | - name: load e2e image 60 | if: inputs.type == 'pullrequest' 61 | run: | 62 | echo "Loading Docker image from image.tar..." 63 | if [ -z "${VSPHERE_GPU_VM_IP}" ]; then 64 | echo "Error: VSPHERE_GPU_VM_IP is not defined!" 65 | exit 1 66 | fi 67 | scp ./image/image.tar root@$VSPHERE_GPU_VM_IP:/home/ 68 | ssh root@$VSPHERE_GPU_VM_IP "nerdctl load -i /home/image.tar" 69 | ssh root@$VSPHERE_GPU_VM_IP "nerdctl image ls | grep hami" 70 | 71 | - name: deploy hami helm 72 | run: | 73 | make helm-deploy 74 | 75 | - name: e2e test 76 | run: | 77 | make e2e-test 78 | -------------------------------------------------------------------------------- /.github/workflows/call-release-website.yaml: -------------------------------------------------------------------------------- 1 | name: Call Release website 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | ref: 7 | required: true 8 | type: string 9 | permissions: write-all 10 | 11 | jobs: 12 | build-website: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: release hami website 16 | # https://github.com/actions/virtual-environments/issues/709 17 | run: | 18 | echo "Need to publish hami website" 19 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | name: "CodeQL" 7 | 8 | on: 9 | workflow_dispatch: 10 | push: 11 | branches: ["master","dev"] 12 | paths-ignore: 13 | - "**/*.json" 14 | - "**/*.md" 15 | - "**/*.txt" 16 | - "**/*.yml" 17 | schedule: 18 | - cron: "0 4 * * 6" 19 | 20 | permissions: 21 | security-events: write 22 | # required to fetch internal or private CodeQL packs 23 | packages: read 24 | 25 | # only required for workflows in private repositories 26 | actions: read 27 | contents: read 28 | 29 | jobs: 30 | analyze: 31 | name: Analyze 32 | runs-on: ubuntu-latest 33 | if: github.repository == 'Project-HAMi/HAMi' 34 | 35 | strategy: 36 | fail-fast: false 37 | matrix: 38 | language: ["go"] 39 | 40 | steps: 41 | - name: Checkout repository 42 | uses: actions/checkout@v4 43 | - name: Checkout submodule 44 | uses: Mushus/checkout-submodule@v1.0.1 45 | with: 46 | basePath: # optional, default is . 47 | submodulePath: libvgpu 48 | - if: matrix.language == 'go' 49 | name: Set go version 50 | uses: actions/setup-go@v5 51 | with: 52 | go-version-file: go.mod 53 | 54 | # Initializes the CodeQL tools for scanning. 
55 | - name: Initialize CodeQL 56 | uses: github/codeql-action/init@v3 57 | with: 58 | languages: ${{ matrix.language }} 59 | # If you wish to specify custom queries, you can do so here or in a config file. 60 | # By default, queries listed here will override any specified in a config file. 61 | # Prefix the list here with "+" to use these queries and those in the config file. 62 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 63 | 64 | - name: Perform CodeQL Analysis 65 | uses: github/codeql-action/analyze@v3 66 | -------------------------------------------------------------------------------- /.github/workflows/lint-chart.yaml: -------------------------------------------------------------------------------- 1 | name: Chart Lint 2 | 3 | on: 4 | push: 5 | # Exclude branches created by Dependabot to avoid triggering current workflow 6 | # for PRs initiated by Dependabot. 7 | branches-ignore: 8 | - 'dependabot/**' 9 | pull_request: 10 | paths: 11 | - "charts/**" 12 | 13 | jobs: 14 | chart-lint-test: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout 18 | uses: actions/checkout@v4 19 | with: 20 | fetch-depth: 0 21 | 22 | - name: Set up Helm 23 | uses: azure/setup-helm@v4 24 | with: 25 | version: v3.7.1 26 | - name: Lint Chart 27 | run: | 28 | make lint_chart 29 | - name: Check chart version 30 | run: bash ./hack/verify-chart-version.sh 31 | 32 | -------------------------------------------------------------------------------- /.github/workflows/test-self-hosted.yaml: -------------------------------------------------------------------------------- 1 | name: Test self-hosted-runner 2 | 3 | on: 4 | push: 5 | # Exclude branches created by Dependabot to avoid triggering current workflow 6 | # for PRs initiated by Dependabot. 7 | branches-ignore: 8 | - 'dependabot/**' 9 | pull_request: 10 | paths: 11 | - "charts/**" 12 | 13 | jobs: 14 | e2e: 15 | runs-on: self-hosted 16 | steps: 17 | - name: e2e test 18 | # https://github.com/actions/virtual-environments/issues/709 19 | run: | 20 | echo "Need to add e2e test" 21 | 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | run_device_plugin.sh 3 | run_scheduler.sh 4 | device_plugin.sh 5 | libvgpu/build 6 | updateso.sh 7 | libvgpu.so 8 | .idea 9 | vendor 10 | license 11 | vgpuvalidator 12 | _output/ 13 | coverage.out 14 | .DS_Store 15 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "libvgpu"] 2 | path = libvgpu 3 | url = https://github.com/Project-HAMi/HAMi-core.git 4 | branch = main 5 | -------------------------------------------------------------------------------- /.golangci.yaml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | run: 3 | concurrency: 4 4 | modules-download-mode: readonly 5 | output: 6 | formats: 7 | text: 8 | path: stdout 9 | print-linter-name: true 10 | print-issued-lines: true 11 | colors: true 12 | linters: 13 | default: none 14 | enable: 15 | - asciicheck 16 | - forcetypeassert 17 | - godot 18 | - misspell 19 | - staticcheck 20 | settings: 21 | dupl: 22 | threshold: 800 23 | errcheck: 24 | check-type-assertions: true 25 | check-blank: true 26 | errorlint: 27 | errorf: true 28 | asserts: true 29 | comparison: true 30 | goconst: 31 | min-len: 3 32 | min-occurrences: 3 33 | 
gocritic: 34 | disabled-checks: 35 | - commentedOutCode 36 | - whyNoLint 37 | enabled-tags: 38 | - diagnostic 39 | - experimental 40 | - opinionated 41 | - performance 42 | - style 43 | settings: 44 | hugeParam: 45 | sizeThreshold: 80 46 | rangeExprCopy: 47 | sizeThreshold: 512 48 | rangeValCopy: 49 | sizeThreshold: 128 50 | gocyclo: 51 | min-complexity: 20 52 | godot: 53 | scope: declarations 54 | capital: false 55 | nestif: 56 | min-complexity: 20 57 | exclusions: 58 | generated: lax 59 | presets: 60 | - comments 61 | - common-false-positives 62 | - legacy 63 | - std-error-handling 64 | paths: 65 | - third_party$ 66 | - builtin$ 67 | - examples$ 68 | - pkg/device-plugin 69 | issues: 70 | uniq-by-line: true 71 | formatters: 72 | enable: 73 | - gofmt 74 | - goimports 75 | settings: 76 | gofmt: 77 | simplify: true 78 | gofumpt: 79 | extra-rules: true 80 | goimports: 81 | local-prefixes: 82 | - github.com/Project-HAMi/HAMi 83 | exclusions: 84 | generated: lax 85 | paths: 86 | - third_party$ 87 | - builtin$ 88 | - examples$ 89 | -------------------------------------------------------------------------------- /.trivyignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/.trivyignore -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | - Please check [HAMi Community Membership](https://github.com/Project-HAMi/community/blob/main/community-membership.md) to find how to be a contributor. 4 | - Here is the full list of the [MAINTAINERS](./MAINTAINERS.md). 5 | 6 | The following people, in alphabetical order, have either authored or signed off on commits in the HAMi repository: 7 | 8 | 9 | | Contributor | Email | 10 | |-----------------|-----------| 11 | | [archlitchi](https://github.com/archlitchi) | archlitchi@gmail.com| 12 | | [atttx123](https://github.com/atttx123) | - | 13 | | [chaunceyjiang](https://github.com/chaunceyjiang) | chaunceyjiang@gmail.com| 14 | | [CoderTH](https://github.com/CoderTH) | - | 15 | | [gsakun](https://github.com/gsakun) | - | 16 | | [lengrongfu](https://github.com/lengrongfu) | - | 17 | | [ouyangluwei](https://github.com/ouyangluwei163) | ouyangluwei@riseunion.io | 18 | | peizhaoyou | peizhaoyou@4paradigm.com | 19 | | [wawa0210](https://github.com/wawa0210) | xiaozhang0210@hotmail.com | 20 | | [whybeyoung](https://github.com/whybeyoung) | - | 21 | | [yinyu](https://github.com/Nimbus318) | nimbus-nimo@proton.me | 22 | | [yangshiqi](https://github.com/yangshiqi) | yangshiqi@riseunion.io | 23 | | zhengbingxian | - | 24 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # HAMi Community Code of Conduct 2 | 3 | Please refer to our [HAMi Community Code of Conduct](https://github.com/Project-HAMi/community/blob/main/CODE-OF-CONDUCT.md). 4 | -------------------------------------------------------------------------------- /DEPENDENCY.md: -------------------------------------------------------------------------------- 1 | # Environment Dependencies Policy 2 | 3 | ## Purpose 4 | 5 | This policy establishes guidelines for managing third-party packages in the HAMi repository. 
Its goal is to ensure that all dependencies are secure, up-to-date, and necessary for the project’s functionality. 6 | 7 | ## Scope 8 | 9 | This policy applies to all maintainers of the HAMi repository and governs all third-party packages incorporated into the project. 10 | 11 | ## Policy 12 | 13 | Maintainers must adhere to the following when incorporating third-party packages: 14 | 15 | - **Necessity:** Include only those packages that are essential to the project’s functionality. 16 | - **Latest Stable Versions:** Use the latest stable releases whenever possible. 17 | - **Security:** Avoid packages with known security vulnerabilities. 18 | - **Version Pinning:** Lock all dependencies to specific versions to maintain consistency. 19 | - **Dependency Management:** Utilize an appropriate dependency management tool (e.g., Go modules, npm, pip) to handle third-party packages. 20 | - **Testing:** Ensure that any new dependency passes all automated tests before integration. 21 | 22 | ## Procedure 23 | 24 | When adding a new third-party package, maintainers should: 25 | 26 | 1. **Assess Need:** Determine whether the package is truly necessary for the project. 27 | 2. **Conduct Research:** Review the package’s maintenance status and reputation within the community. 28 | 3. **Select Version:** Opt for the latest stable version that meets the project’s requirements. 29 | 4. **Pin the Version:** Explicitly pin the dependency to the chosen version within the repository. 30 | 5. **Update Documentation:** Revise the project documentation to include details about the new dependency. 31 | 32 | ## Archive/Deprecation 33 | 34 | If a third-party package becomes deprecated or discontinued, maintainers must promptly identify and integrate a suitable alternative while updating the documentation accordingly. 35 | 36 | ## Enforcement 37 | 38 | Compliance with this policy is monitored by the HAMi maintainers. All dependency-related changes are subject to peer review to ensure adherence to these guidelines. 39 | 40 | ## Exceptions 41 | 42 | Exceptions to this policy may be granted by the HAMi project lead on a case-by-case basis. Any exceptions must be documented with a clear rationale. 43 | 44 | ## Credits 45 | 46 | This policy has been adapted and optimized based on guidelines from the [Kubescape Community](https://github.com/kubescape/kubescape/blob/master/docs/environment-dependencies-policy.md). -------------------------------------------------------------------------------- /HAMi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/HAMi.jpg -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | # Maintainers 2 | 3 | - Please check [HAMi Community Membership](https://github.com/Project-HAMi/community/blob/main/community-membership.md) to find how to level up through the project. 4 | - Please see [Contributors](./AUTHORS.md) for the full list of contributors to the project. 
5 | 6 | ## HAMi Committers 7 | 8 | | Maintainer | Email | Employer | 9 | |---------------------------------------------------|-----------|-----------| 10 | | [Li Mengxuan](https://github.com/archlitchi) | archlitchi@gmail.com | [dynamia.ai](https://www.dynamia.ai/) | 11 | | [Xiao Zhang](https://github.com/wawa0210) | xiaozhang0210@hotmail.com | [dynamia.ai](https://www.dynamia.ai/) | 12 | | [Wang Leibo](https://github.com/william-wang) | wang.platform@gmail.com | [HuaweiCloud](https://www.huaweicloud.com/) | 13 | | [Yin Yu](https://github.com/Nimbus318) | nimbus-nimo@proton.me | Independent Developer | 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ##### Global variables ##### 2 | include version.mk Makefile.defs 3 | 4 | all: build 5 | 6 | docker: 7 | docker build \ 8 | --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} \ 9 | --build-arg TARGET_ARCH=${TARGET_ARCH} \ 10 | --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} \ 11 | --build-arg DEST_DIR=${DEST_DIR} \ 12 | --build-arg VERSION=${VERSION} \ 13 | --build-arg GOPROXY=https://goproxy.cn,direct \ 14 | . -f=docker/Dockerfile -t ${IMG_TAG} 15 | 16 | dockerwithlib: 17 | docker build \ 18 | --no-cache \ 19 | --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} \ 20 | --build-arg TARGET_ARCH=${TARGET_ARCH} \ 21 | --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} \ 22 | --build-arg DEST_DIR=${DEST_DIR} \ 23 | --build-arg VERSION=${VERSION} \ 24 | --build-arg GOPROXY=https://goproxy.cn,direct \ 25 | . -f=docker/Dockerfile.withlib -t ${IMG_TAG} 26 | 27 | tidy: 28 | $(GO) mod tidy 29 | 30 | proto: 31 | $(GO) get github.com/gogo/protobuf/protoc-gen-gofast@v1.3.2 32 | protoc --gofast_out=plugins=grpc:. ./pkg/api/*.proto 33 | 34 | build: $(CMDS) $(DEVICES) 35 | 36 | $(CMDS): 37 | $(GO) build -ldflags '-s -w -X github.com/Project-HAMi/HAMi/pkg/version.version=$(VERSION)' -o ${OUTPUT_DIR}/$@ ./cmd/$@ 38 | 39 | $(DEVICES): 40 | $(GO) build -ldflags '-s -w -X github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/info.version=$(VERSION)' -o ${OUTPUT_DIR}/$@-device-plugin ./cmd/device-plugin/$@ 41 | 42 | clean: 43 | $(GO) clean -r -x ./cmd/... 
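# The "-" prefix on the next command tells make to ignore that command's exit status, so "clean" still succeeds if $(OUTPUT_DIR) is already gone.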
44 | -rm -rf $(OUTPUT_DIR) 45 | 46 | .PHONY: all build docker clean test $(CMDS) 47 | 48 | test: 49 | mkdir -p ./_output/coverage/ 50 | bash hack/unit-test.sh 51 | 52 | lint: 53 | bash hack/verify-staticcheck.sh 54 | 55 | .PHONY: verify 56 | verify: 57 | hack/verify-all.sh 58 | 59 | .PHONY: lint_dockerfile 60 | lint_dockerfile: 61 | @ docker run --rm \ 62 | -v $(ROOT_DIR)/.trivyignore:/.trivyignore \ 63 | -v /tmp/trivy:/root/trivy.cache/ \ 64 | -v $(ROOT_DIR):/tmp/src \ 65 | aquasec/trivy:$(TRIVY_VERSION) config --exit-code 1 --severity $(LINT_TRIVY_SEVERITY_LEVEL) /tmp/src/docker ; \ 66 | (($$?==0)) || { echo "error, failed to check dockerfile trivy" && exit 1 ; } ; \ 67 | echo "dockerfile trivy check: pass" 68 | 69 | .PHONY: lint_chart 70 | lint_chart: 71 | @ docker run --rm \ 72 | -v $(ROOT_DIR)/.trivyignore:/.trivyignore \ 73 | -v /tmp/trivy:/root/trivy.cache/ \ 74 | -v $(ROOT_DIR):/tmp/src \ 75 | aquasec/trivy:$(TRIVY_VERSION) config --exit-code 1 --severity $(LINT_TRIVY_SEVERITY_LEVEL) /tmp/src/charts ; \ 76 | (($$?==0)) || { echo "error, failed to check chart trivy" && exit 1 ; } ; \ 77 | echo "chart trivy check: pass" 78 | 79 | .PHONY: e2e-env-setup 80 | e2e-env-setup: 81 | ./hack/e2e-test-setup.sh 82 | 83 | .PHONY: helm-deploy 84 | helm-deploy: 85 | ./hack/deploy-helm.sh "${E2E_TYPE}" "${KUBE_CONF}" "${HAMI_VERSION}" 86 | 87 | .PHONY: e2e-test 88 | e2e-test: 89 | ./hack/e2e-test.sh "${E2E_TYPE}" "${KUBE_CONF}" 90 | -------------------------------------------------------------------------------- /Makefile.defs: -------------------------------------------------------------------------------- 1 | 2 | SHELL := /bin/bash 3 | .SHELLFLAGS := -eu -o pipefail -c 4 | 5 | ROOT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 6 | 7 | INSTALL = install 8 | 9 | PREFIX?=/usr 10 | BINDIR?=$(PREFIX)/bin 11 | TARGETARCH ?= amd64 12 | 13 | DESTDIR_BIN ?= $(ROOT_DIR)/output/$(TARGETARCH)/bin 14 | DESTDIR_BASH_COMPLETION ?= $(ROOT_DIR)/output/$(TARGETARCH)/bash-completion 15 | 16 | VERSION?="" 17 | ifeq ($(VERSION), "") 18 | VERSION=$(shell cat $(dir $(lastword $(MAKEFILE_LIST)))/VERSION) 19 | endif 20 | 21 | ECHO_GEN=echo " GEN $(RELATIVE_DIR)/" 22 | 23 | LINT_TRIVY_SEVERITY_LEVEL ?= CRITICAL 24 | TRIVY_VERSION=0.36.0 25 | 26 | .PHONY: print-version 27 | print-version: 28 | @echo $(VERSION) 29 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | HAMi(https://project-hami.io/) 2 | Copyright HAMi Contributors 3 | 4 | This product includes software developed by 5 | NVIDIA CORPORATION (https://www.nvidia.com). 6 | Copyright (c) NVIDIA CORPORATION. All rights reserved. 7 | 8 | This product includes software developed by 9 | The HAMi Authors. 10 | Copyright 2024 The HAMi Authors. 11 | 12 | Both are licensed under the Apache License, Version 2.0. 
13 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | reviewers: 2 | - archlitchi 3 | - wawa0210 4 | - chaunceyjiang 5 | - lengrongfu 6 | approvers: 7 | - archlitchi 8 | - wawa0210 9 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | The following table outlines which versions of HAMi receive security updates: 6 | 7 | | Version | Supported | 8 | |---------|--------------------| 9 | | 2.5.x | ✅ Security fixes | 10 | | 2.4.x | ✅ Security fixes | 11 | | before 2.4.0 | ❌ No longer supported | 12 | 13 | ## Reporting a Vulnerability 14 | 15 | If you discover a security vulnerability in HAMi, we strongly encourage you to report it responsibly. Please **do not** disclose security vulnerabilities publicly without following our responsible disclosure process. 16 | 17 | ### How to Report 18 | - **GitHub Security Advisories**: [submit a private vulnerability report via GitHub](https://github.com/Project-HAMi/HAMi/security/advisories/new). 19 | - **Bug Bounty**: Currently, HAMi does not offer a public bug bounty program. 20 | 21 | ### Information to Include 22 | When reporting a security issue, please include: 23 | - A clear and concise description of the vulnerability. 24 | - Steps to reproduce the issue. 25 | - Any potential attack scenarios or security impact. 26 | - Suggested mitigations or fixes, if available. 27 | 28 | ## Response Process 29 | 30 | We follow a structured process to handle security reports. 31 | 32 | Response times could be affected by weekends, holidays, breaks or time zone differences. That said, the maintainers will endeavour to reply as soon as possible, ideally within 5 working days. 33 | 34 | 35 | ## Third-Party Dependencies 36 | 37 | HAMi relies on third-party libraries and containers. We monitor dependencies and promptly apply security patches. 38 | 39 | 40 | Thank you for helping us make HAMi more secure! 🔒 -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | v2.5.0 2 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking the vGPU scheduler 2 | 3 | ## Prerequisites 4 | 5 | ### How to build the benchmark image 6 | 7 | ```bash 8 | cd HAMi/benchmarks/ai-benchmark 9 | 10 | sh build.sh 11 | ``` 12 | 13 | ## How to install the official nvidia device plugin 14 | 15 | Please refer to [Quick Start](https://github.com/NVIDIA/k8s-device-plugin?tab=readme-ov-file#quick-start) in the official nvidia device plugin repository. 16 | 17 | ## Run the benchmark 18 | 19 | ```bash 20 | cd HAMi/benchmarks/deployments 21 | 22 | kubectl apply -f job-on-hami.yml 23 | 24 | kubectl apply -f job-on-nvidia-device-plugin.yml 25 | ``` -------------------------------------------------------------------------------- /benchmarks/ai-benchmark/Dockerfile: -------------------------------------------------------------------------------- 1 | # This Dockerfile is used to build a Docker image for running the AI Benchmark. 2 | # It is based on the tensorflow/tensorflow:latest-gpu image. 
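# A typical build invocation, mirroring the defaults in build.sh alongside this file
# (illustrative comment, not part of the original Dockerfile):
#   docker buildx build --platform linux/amd64 -t vgpu-benchmark:v0.0.1 -f Dockerfile .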
3 | 4 | FROM tensorflow/tensorflow:latest-gpu 5 | 6 | # Set the working directory to /ai-benchmark 7 | WORKDIR ai-benchmark 8 | 9 | # Update the package list and install git and apt-utils 10 | RUN apt-get update && \ 11 | apt-get install -y --no-install-recommends apt-utils git && \ 12 | rm -rf /var/lib/apt/lists/* && \ 13 | pip install --no-cache-dir --upgrade pip && \ 14 | git clone https://github.com/Project-HAMi/ai-benchmark . && \ 15 | pip install --no-cache-dir -r requirements.txt 16 | 17 | # Set the default command to run when the container starts 18 | CMD ["python", "./main.py"] -------------------------------------------------------------------------------- /benchmarks/ai-benchmark/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | IMAGE="vgpu-benchmark" 5 | TAG="v0.0.1" 6 | PLATFORM="linux/amd64" 7 | 8 | docker buildx build --push \ 9 | --platform $PLATFORM \ 10 | --no-cache \ 11 | -t "$IMAGE:$TAG" \ 12 | -f Dockerfile . -------------------------------------------------------------------------------- /benchmarks/deployments/job-on-hami.yml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: ai-benchmark-on-hami 5 | spec: 6 | template: 7 | metadata: 8 | name: ai-benchmark-on-hami 9 | spec: 10 | containers: 11 | - name: ai-benchmark-on-hami 12 | image: 4pdosc/ai-benchmark:2.4.1-gpu 13 | resources: 14 | requests: 15 | nvidia.com/gpu: 1 16 | nvidia.com/gpumem-percentage: 50 17 | limits: 18 | nvidia.com/gpu: 1 19 | nvidia.com/gpumem-percentage: 50 20 | restartPolicy: Never -------------------------------------------------------------------------------- /benchmarks/deployments/job-on-nvidia-device-plugin.yml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: ai-benchmark-on-official 5 | spec: 6 | template: 7 | metadata: 8 | name: ai-benchmark-on-official 9 | spec: 10 | containers: 11 | - name: ai-benchmark-on-official 12 | image: 4pdosc/ai-benchmark:2.4.1-gpu 13 | resources: 14 | requests: 15 | nvidia.com/gpu: 1 16 | limits: 17 | nvidia.com/gpu: 1 18 | restartPolicy: Never -------------------------------------------------------------------------------- /charts/Makefile: -------------------------------------------------------------------------------- 1 | # get VERSION 2 | .DEFAULT_GOAL := all 3 | include ../Makefile.defs 4 | 5 | VERSION_REGEX := '[vV]*[0-9]\+\.[0-9]\+\.[0-9]\+.*' 6 | CHART_FILE := "./hami/Chart.yaml" 7 | VALUES_FILE := "./hami/values.yaml" 8 | 9 | .PHONY: all lint update-versions 10 | all: update-versions lint package 11 | 12 | #update version in chart 13 | update-versions: 14 | $(ECHO_GEN) " Updating Chart version to $(VERSION)" 15 | echo "VERSION=$(VERSION)" 16 | echo "VERSION_MAJOR=$(VERSION_MAJOR)" 17 | echo "GIT_VERSION=$(GIT_VERSION)" 18 | echo "FULL_BUILD_VERSION=$(FULL_BUILD_VERSION)" 19 | @# Update chart versions to point to the current version. 
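@# chart_version below drops the "v" prefix (e.g. v2.5.0 -> 2.5.0) for Chart.yaml's version/appVersion, while values.yaml keeps the "v"-prefixed tag via hami_version.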
20 | hami_version="$(VERSION)"; \ 21 | chart_version=` echo $(VERSION) | tr -d 'v' ` ; \ 22 | sed -i 's/version: "*'$(VERSION_REGEX)'"*/version: '$$chart_version'/g' $(CHART_FILE); \ 23 | sed -i 's/appVersion: "*'$(VERSION_REGEX)'"*/appVersion: "'$$chart_version'"/g' $(CHART_FILE); \ 24 | sed -i 's/version: "*'$(VERSION_REGEX)'"*/version: "'$$hami_version'"/g' $(VALUES_FILE) 25 | 26 | lint: update-versions 27 | helm lint --with-subcharts --values ./hami/values.yaml ./hami --debug 28 | 29 | package: lint 30 | helm package ./hami --debug 31 | 32 | clean: 33 | rm -f *.tgz 34 | 35 | -------------------------------------------------------------------------------- /charts/hami/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: hami 3 | version: 2.5.0 4 | kubeVersion: ">= 1.18.0-0" 5 | description: Heterogeneous AI Computing Virtualization Middleware 6 | keywords: 7 | - vgpu 8 | - gpu 9 | type: application 10 | maintainers: 11 | - name: limengxuan 12 | email: archlitchi@gmail.com 13 | - name: zhangxiao 14 | email: xiaozhang0210@hotmail.com 15 | appVersion: "2.5.0" 16 | 17 | -------------------------------------------------------------------------------- /charts/hami/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | ** Please be patient while the chart is being deployed ** 2 | Resource name: {{ .Values.resourceName }} 3 | 4 | -------------------------------------------------------------------------------- /charts/hami/templates/device-plugin/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ include "hami-vgpu.device-plugin" . }} 5 | namespace: {{ include "hami-vgpu.namespace" . }} 6 | labels: 7 | app.kubernetes.io/component: hami-device-plugin 8 | {{- include "hami-vgpu.labels" . | nindent 4 }} 9 | data: 10 | config.json: | 11 | { 12 | "nodeconfig": [ 13 | { 14 | "name": "m5-cloudinfra-online02", 15 | "operatingmode": "hami-core", 16 | "devicememoryscaling": 1.8, 17 | "devicesplitcount": 10, 18 | "migstrategy":"none", 19 | "filterdevices": { 20 | "uuid": [], 21 | "index": [] 22 | } 23 | } 24 | ] 25 | } -------------------------------------------------------------------------------- /charts/hami/templates/device-plugin/monitorrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "hami-vgpu.device-plugin" . }}-monitor 5 | rules: 6 | - apiGroups: 7 | - "" 8 | resources: 9 | - pods 10 | verbs: 11 | - get 12 | - create 13 | - watch 14 | - list 15 | - update 16 | - patch 17 | - apiGroups: 18 | - "" 19 | resources: 20 | - nodes 21 | verbs: 22 | - get 23 | - update 24 | - list 25 | - patch 26 | 27 | 28 | -------------------------------------------------------------------------------- /charts/hami/templates/device-plugin/monitorrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "hami-vgpu.device-plugin" . }} 5 | labels: 6 | app.kubernetes.io/component: "hami-device-plugin" 7 | {{- include "hami-vgpu.labels" . | nindent 4 }} 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | #name: cluster-admin 12 | name: {{ include "hami-vgpu.device-plugin" . 
}}-monitor 13 | subjects: 14 | - kind: ServiceAccount 15 | name: {{ include "hami-vgpu.device-plugin" . }} 16 | namespace: {{ include "hami-vgpu.namespace" . }} 17 | -------------------------------------------------------------------------------- /charts/hami/templates/device-plugin/monitorservice.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "hami-vgpu.device-plugin" . }}-monitor 5 | namespace: {{ include "hami-vgpu.namespace" . }} 6 | labels: 7 | app.kubernetes.io/component: hami-device-plugin 8 | {{- include "hami-vgpu.labels" . | nindent 4 }} 9 | {{- if .Values.devicePlugin.service.labels }} # Use devicePlugin instead of scheduler 10 | {{ toYaml .Values.devicePlugin.service.labels | indent 4 }} 11 | {{- end }} 12 | {{- if .Values.devicePlugin.service.annotations }} # Use devicePlugin instead of scheduler 13 | annotations: {{ toYaml .Values.devicePlugin.service.annotations | nindent 4 }} 14 | {{- end }} 15 | spec: 16 | type: {{ .Values.devicePlugin.service.type | default "NodePort" }} # Default type is NodePort 17 | ports: 18 | - name: monitorport 19 | port: {{ .Values.devicePlugin.service.httpPort | default 31992 }} # Default HTTP port is 31992 20 | targetPort: 9394 21 | {{- if eq (.Values.devicePlugin.service.type | default "NodePort") "NodePort" }} # If type is NodePort, set nodePort 22 | nodePort: {{ .Values.devicePlugin.service.httpPort | default 31992 }} 23 | {{- end }} 24 | protocol: TCP 25 | selector: 26 | app.kubernetes.io/component: hami-device-plugin 27 | {{- include "hami-vgpu.selectorLabels" . | nindent 4 }} -------------------------------------------------------------------------------- /charts/hami/templates/device-plugin/monitorserviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "hami-vgpu.device-plugin" . }} 5 | namespace: {{ include "hami-vgpu.namespace" . }} 6 | labels: 7 | app.kubernetes.io/component: "hami-device-plugin" 8 | {{- include "hami-vgpu.labels" . | nindent 4 }} 9 | -------------------------------------------------------------------------------- /charts/hami/templates/device-plugin/runtime-class.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.devicePlugin.createRuntimeClass .Values.devicePlugin.runtimeClassName }} 2 | apiVersion: node.k8s.io/v1 3 | kind: RuntimeClass 4 | metadata: 5 | name: {{ .Values.devicePlugin.runtimeClassName }} 6 | annotations: 7 | helm.sh/hook: pre-install,pre-upgrade 8 | handler: nvidia 9 | {{- end }} 10 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/certmanager.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.scheduler.certManager.enabled }} 2 | apiVersion: cert-manager.io/v1 3 | kind: Certificate 4 | metadata: 5 | name: {{ include "hami-vgpu.scheduler" . }}-serving-cert 6 | namespace: {{ include "hami-vgpu.namespace" . }} 7 | labels: 8 | app.kubernetes.io/component: hami-scheduler 9 | {{- include "hami-vgpu.labels" . | nindent 4 }} 10 | spec: 11 | dnsNames: 12 | - {{ include "hami-vgpu.scheduler" . }}.{{ include "hami-vgpu.namespace" . }}.svc 13 | - {{ include "hami-vgpu.scheduler" . }}.{{ include "hami-vgpu.namespace" . 
}}.svc.cluster.local 14 | issuerRef: 15 | kind: Issuer 16 | name: {{ include "hami-vgpu.scheduler" . }}-selfsigned-issuer 17 | secretName: {{ include "hami-vgpu.scheduler.tls" . }} 18 | --- 19 | apiVersion: cert-manager.io/v1 20 | kind: Issuer 21 | metadata: 22 | name: {{ include "hami-vgpu.scheduler" . }}-selfsigned-issuer 23 | namespace: {{ include "hami-vgpu.namespace" . }} 24 | labels: 25 | app.kubernetes.io/component: hami-scheduler 26 | {{- include "hami-vgpu.labels" . | nindent 4 }} 27 | spec: 28 | selfSigned: {} 29 | {{- end }} 30 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/configmapnew.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.scheduler.kubeScheduler.enabled }} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ include "hami-vgpu.scheduler" . }}-newversion 6 | namespace: {{ include "hami-vgpu.namespace" . }} 7 | labels: 8 | app.kubernetes.io/component: hami-scheduler 9 | {{- include "hami-vgpu.labels" . | nindent 4 }} 10 | data: 11 | config.yaml: | 12 | {{- if gt (regexReplaceAll "[^0-9]" .Capabilities.KubeVersion.Minor "" | int) 25}} 13 | apiVersion: kubescheduler.config.k8s.io/v1 14 | {{- else }} 15 | apiVersion: kubescheduler.config.k8s.io/v1beta2 16 | {{- end }} 17 | kind: KubeSchedulerConfiguration 18 | leaderElection: 19 | leaderElect: false 20 | profiles: 21 | - schedulerName: {{ .Values.schedulerName }} 22 | extenders: 23 | - urlPrefix: "https://127.0.0.1:443" 24 | filterVerb: filter 25 | bindVerb: bind 26 | nodeCacheCapable: true 27 | weight: 1 28 | httpTimeout: 30s 29 | enableHTTPS: true 30 | tlsConfig: 31 | insecure: true 32 | managedResources: 33 | - name: {{ .Values.resourceName }} 34 | ignoredByScheduler: true 35 | - name: {{ .Values.resourceMem }} 36 | ignoredByScheduler: true 37 | - name: {{ .Values.resourceCores }} 38 | ignoredByScheduler: true 39 | - name: {{ .Values.resourceMemPercentage }} 40 | ignoredByScheduler: true 41 | - name: {{ .Values.resourcePriority }} 42 | ignoredByScheduler: true 43 | - name: {{ .Values.mluResourceName }} 44 | ignoredByScheduler: true 45 | - name: {{ .Values.dcuResourceName }} 46 | ignoredByScheduler: true 47 | - name: {{ .Values.dcuResourceMem }} 48 | ignoredByScheduler: true 49 | - name: {{ .Values.dcuResourceCores }} 50 | ignoredByScheduler: true 51 | - name: {{ .Values.iluvatarResourceName }} 52 | ignoredByScheduler: true 53 | - name: "metax-tech.com/gpu" 54 | ignoredByScheduler: true 55 | - name: {{ .Values.metaxResourceName }} 56 | ignoredByScheduler: true 57 | - name: {{ .Values.metaxResourceCore }} 58 | ignoredByScheduler: true 59 | - name: {{ .Values.metaxResourceMem }} 60 | ignoredByScheduler: true 61 | {{- if .Values.devices.ascend.enabled }} 62 | {{- range .Values.devices.ascend.customresources }} 63 | - name: {{ . }} 64 | ignoredByScheduler: true 65 | {{- end }} 66 | {{- end }} 67 | {{- if .Values.devices.mthreads.enabled }} 68 | {{- range .Values.devices.mthreads.customresources }} 69 | - name: {{ . }} 70 | ignoredByScheduler: true 71 | {{- end }} 72 | {{- end }} 73 | {{- if .Values.devices.enflame.enabled }} 74 | {{- range .Values.devices.enflame.customresources }} 75 | - name: {{ . 
}} 76 | ignoredByScheduler: true 77 | {{- end }} 78 | {{- end }} 79 | {{- end }} 80 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/job-patch/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: {{ include "hami-vgpu.fullname" . }}-admission 6 | annotations: 7 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 8 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 9 | labels: 10 | {{- include "hami-vgpu.labels" . | nindent 4 }} 11 | app.kubernetes.io/component: admission-webhook 12 | rules: 13 | - apiGroups: 14 | - admissionregistration.k8s.io 15 | resources: 16 | #- validatingwebhookconfigurations 17 | - mutatingwebhookconfigurations 18 | verbs: 19 | - get 20 | - update 21 | {{- if .Values.podSecurityPolicy.enabled }} 22 | - apiGroups: ['extensions'] 23 | resources: ['podsecuritypolicies'] 24 | verbs: ['use'] 25 | resourceNames: 26 | - {{ include "hami-vgpu.fullname" . }}-admission 27 | {{- end }} 28 | {{- end }} -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/job-patch/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: {{ include "hami-vgpu.fullname" . }}-admission 6 | annotations: 7 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 8 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 9 | labels: 10 | {{- include "hami-vgpu.labels" . | nindent 4 }} 11 | app.kubernetes.io/component: admission-webhook 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: ClusterRole 15 | name: {{ include "hami-vgpu.fullname" . }}-admission 16 | subjects: 17 | - kind: ServiceAccount 18 | name: {{ include "hami-vgpu.fullname" . }}-admission 19 | namespace: {{ include "hami-vgpu.namespace" . }} 20 | {{- end }} 21 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/job-patch/job-patchWebhook.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} 2 | apiVersion: batch/v1 3 | kind: Job 4 | metadata: 5 | name: {{ include "hami-vgpu.fullname" . }}-admission-patch 6 | namespace: {{ include "hami-vgpu.namespace" . }} 7 | annotations: 8 | "helm.sh/hook": post-install,post-upgrade 9 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 10 | labels: 11 | {{- include "hami-vgpu.labels" . | nindent 4 }} 12 | app.kubernetes.io/component: admission-webhook 13 | spec: 14 | {{- if .Capabilities.APIVersions.Has "batch/v1alpha1" }} 15 | # Alpha feature since k8s 1.12 16 | ttlSecondsAfterFinished: 0 17 | {{- end }} 18 | template: 19 | metadata: 20 | name: {{ include "hami-vgpu.fullname" . }}-admission-patch 21 | {{- if .Values.scheduler.patch.podAnnotations }} 22 | annotations: {{ toYaml .Values.scheduler.patch.podAnnotations | nindent 8 }} 23 | {{- end }} 24 | labels: 25 | {{- include "hami-vgpu.labels" . 
| nindent 8 }} 26 | app.kubernetes.io/component: admission-webhook 27 | hami.io/webhook: ignore 28 | spec: 29 | {{- include "hami-vgpu.imagePullSecrets" . | nindent 6}} 30 | {{- if .Values.scheduler.patch.priorityClassName }} 31 | priorityClassName: {{ .Values.scheduler.patch.priorityClassName }} 32 | {{- end }} 33 | containers: 34 | - name: patch 35 | {{- if ge (regexReplaceAll "[^0-9]" .Capabilities.KubeVersion.Minor "" | int) 22 }} 36 | image: {{ .Values.scheduler.patch.imageNew }} 37 | {{- else }} 38 | image: {{ .Values.scheduler.patch.image }} 39 | {{- end }} 40 | imagePullPolicy: {{ .Values.scheduler.patch.imagePullPolicy }} 41 | args: 42 | - patch 43 | - --webhook-name={{ include "hami-vgpu.scheduler.webhook" . }} 44 | - --namespace={{ include "hami-vgpu.namespace" . }} 45 | - --patch-validating=false 46 | - --secret-name={{ include "hami-vgpu.scheduler.tls" . }} 47 | restartPolicy: OnFailure 48 | serviceAccountName: {{ include "hami-vgpu.fullname" . }}-admission 49 | {{- if .Values.scheduler.patch.nodeSelector }} 50 | nodeSelector: {{ toYaml .Values.scheduler.patch.nodeSelector | nindent 8 }} 51 | {{- end }} 52 | {{- if .Values.scheduler.patch.tolerations }} 53 | tolerations: {{ toYaml .Values.scheduler.patch.tolerations | nindent 8 }} 54 | {{- end }} 55 | securityContext: 56 | runAsNonRoot: true 57 | runAsUser: {{ .Values.scheduler.patch.runAsUser }} 58 | {{- end }} 59 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/job-patch/psp.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} 2 | {{- if .Values.podSecurityPolicy.enabled }} 3 | apiVersion: policy/v1beta1 4 | kind: PodSecurityPolicy 5 | metadata: 6 | name: {{ include "hami-vgpu.fullname" . }}-admission 7 | annotations: 8 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 9 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 10 | labels: 11 | {{- include "hami-vgpu.labels" . | nindent 4 }} 12 | app.kubernetes.io/component: admission-webhook 13 | spec: 14 | allowPrivilegeEscalation: false 15 | fsGroup: 16 | ranges: 17 | - max: 65535 18 | min: 1 19 | rule: MustRunAs 20 | requiredDropCapabilities: 21 | - ALL 22 | runAsUser: 23 | rule: MustRunAsNonRoot 24 | seLinux: 25 | rule: RunAsAny 26 | supplementalGroups: 27 | ranges: 28 | - max: 65535 29 | min: 1 30 | rule: MustRunAs 31 | volumes: 32 | - configMap 33 | - emptyDir 34 | - projected 35 | - secret 36 | - downwardAPI 37 | {{- end }} 38 | {{- end }} 39 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/job-patch/role.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | name: {{ include "hami-vgpu.fullname" . }}-admission 6 | namespace: {{ include "hami-vgpu.namespace" . }} 7 | annotations: 8 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 9 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 10 | labels: 11 | {{- include "hami-vgpu.labels" . 
| nindent 4 }} 12 | app.kubernetes.io/component: admission-webhook 13 | rules: 14 | - apiGroups: 15 | - "" 16 | resources: 17 | - secrets 18 | verbs: 19 | - get 20 | - create 21 | {{- end }} 22 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/job-patch/rolebinding.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: RoleBinding 4 | metadata: 5 | name: {{ include "hami-vgpu.fullname" . }}-admission 6 | namespace: {{ include "hami-vgpu.namespace" . }} 7 | annotations: 8 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 9 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 10 | labels: 11 | {{- include "hami-vgpu.labels" . | nindent 4 }} 12 | app.kubernetes.io/component: admission-webhook 13 | roleRef: 14 | apiGroup: rbac.authorization.k8s.io 15 | kind: Role 16 | name: {{ include "hami-vgpu.fullname" . }}-admission 17 | subjects: 18 | - kind: ServiceAccount 19 | name: {{ include "hami-vgpu.fullname" . }}-admission 20 | namespace: {{ include "hami-vgpu.namespace" . }} 21 | {{- end }} 22 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/job-patch/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "hami-vgpu.fullname" . }}-admission 6 | namespace: {{ include "hami-vgpu.namespace" . }} 7 | annotations: 8 | "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade 9 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded 10 | labels: 11 | {{- include "hami-vgpu.labels" . | nindent 4 }} 12 | app.kubernetes.io/component: admission-webhook 13 | {{- end }} 14 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "hami-vgpu.scheduler" . }} 5 | labels: 6 | app.kubernetes.io/component: "hami-scheduler" 7 | {{- include "hami-vgpu.labels" . | nindent 4 }} 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | name: cluster-admin 12 | subjects: 13 | - kind: ServiceAccount 14 | name: {{ include "hami-vgpu.scheduler" . }} 15 | namespace: {{ include "hami-vgpu.namespace" . }} 16 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "hami-vgpu.scheduler" . }} 5 | namespace: {{ include "hami-vgpu.namespace" . }} 6 | labels: 7 | app.kubernetes.io/component: hami-scheduler 8 | {{- include "hami-vgpu.labels" . 
| nindent 4 }} 9 | {{- if .Values.scheduler.service.labels }} 10 | {{ toYaml .Values.scheduler.service.labels | indent 4 }} 11 | {{- end }} 12 | {{- if .Values.scheduler.service.annotations }} 13 | annotations: {{ toYaml .Values.scheduler.service.annotations | nindent 4 }} 14 | {{- end }} 15 | spec: 16 | type: {{ .Values.scheduler.service.type | default "NodePort" }} # Default type is NodePort 17 | ports: 18 | - name: http 19 | port: {{ .Values.scheduler.service.httpPort | default 443 }} # Default HTTP port is 443 20 | targetPort: {{ .Values.scheduler.service.httpTargetPort | default 443 }} 21 | {{- if eq (.Values.scheduler.service.type | default "NodePort") "NodePort" }} # If type is NodePort, set nodePort 22 | nodePort: {{ .Values.scheduler.service.schedulerPort | default 31998 }} 23 | {{- end }} 24 | protocol: TCP 25 | - name: monitor 26 | port: {{ .Values.scheduler.service.monitorPort | default 31993 }} # Default monitoring port is 31993 27 | targetPort: {{ .Values.scheduler.service.monitorTargetPort | default 9395 }} 28 | {{- if eq (.Values.scheduler.service.type | default "NodePort") "NodePort" }} # If type is NodePort, set nodePort 29 | nodePort: {{ .Values.scheduler.service.monitorPort | default 31993 }} 30 | {{- end }} 31 | protocol: TCP 32 | selector: 33 | app.kubernetes.io/component: hami-scheduler 34 | {{- include "hami-vgpu.selectorLabels" . | nindent 4 }} -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "hami-vgpu.scheduler" . }} 5 | namespace: {{ include "hami-vgpu.namespace" . }} 6 | labels: 7 | app.kubernetes.io/component: "hami-scheduler" 8 | {{- include "hami-vgpu.labels" . | nindent 4 }} 9 | -------------------------------------------------------------------------------- /charts/hami/templates/scheduler/webhook.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: MutatingWebhookConfiguration 3 | metadata: 4 | {{- if .Values.scheduler.certManager.enabled }} 5 | annotations: 6 | cert-manager.io/inject-ca-from: {{ include "hami-vgpu.namespace" . }}/{{ include "hami-vgpu.scheduler" . }}-serving-cert 7 | {{- end }} 8 | name: {{ include "hami-vgpu.scheduler.webhook" . }} 9 | webhooks: 10 | - admissionReviewVersions: 11 | - v1beta1 12 | clientConfig: 13 | {{- if .Values.scheduler.admissionWebhook.customURL.enabled }} 14 | url: https://{{ .Values.scheduler.admissionWebhook.customURL.host}}:{{.Values.scheduler.admissionWebhook.customURL.port}}{{.Values.scheduler.admissionWebhook.customURL.path}} 15 | {{- else }} 16 | service: 17 | name: {{ include "hami-vgpu.scheduler" . }} 18 | namespace: {{ include "hami-vgpu.namespace" . 
}} 19 | path: /webhook 20 | port: {{ .Values.scheduler.service.httpPort }} 21 | {{- end }} 22 | failurePolicy: {{ .Values.scheduler.admissionWebhook.failurePolicy }} 23 | matchPolicy: Equivalent 24 | name: vgpu.hami.io 25 | namespaceSelector: 26 | matchExpressions: 27 | - key: hami.io/webhook 28 | operator: NotIn 29 | values: 30 | - ignore 31 | {{- if .Values.scheduler.admissionWebhook.whitelistNamespaces }} 32 | - key: kubernetes.io/metadata.name 33 | operator: NotIn 34 | values: 35 | {{- toYaml .Values.scheduler.admissionWebhook.whitelistNamespaces | nindent 10 }} 36 | {{- end }} 37 | objectSelector: 38 | matchExpressions: 39 | - key: hami.io/webhook 40 | operator: NotIn 41 | values: 42 | - ignore 43 | reinvocationPolicy: {{ .Values.scheduler.admissionWebhook.reinvocationPolicy }} 44 | rules: 45 | - apiGroups: 46 | - "" 47 | apiVersions: 48 | - v1 49 | operations: 50 | - CREATE 51 | resources: 52 | - pods 53 | scope: '*' 54 | sideEffects: None 55 | timeoutSeconds: 10 56 | -------------------------------------------------------------------------------- /cmd/device-plugin/nvidia/watchers.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "os" 21 | "os/signal" 22 | 23 | "github.com/fsnotify/fsnotify" 24 | ) 25 | 26 | func newFSWatcher(files ...string) (*fsnotify.Watcher, error) { 27 | watcher, err := fsnotify.NewWatcher() 28 | if err != nil { 29 | return nil, err 30 | } 31 | 32 | for _, f := range files { 33 | err = watcher.Add(f) 34 | if err != nil { 35 | watcher.Close() 36 | return nil, err 37 | } 38 | } 39 | 40 | return watcher, nil 41 | } 42 | 43 | func newOSWatcher(sigs ...os.Signal) chan os.Signal { 44 | sigChan := make(chan os.Signal, 1) 45 | signal.Notify(sigChan, sigs...) 46 | 47 | return sigChan 48 | } 49 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. 
--go-grpc_opt=paths=source_relative noderpc/noderpc.proto 17 | go build 18 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/noderpc/noderpc.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2015 gRPC authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | //     http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | syntax = "proto3"; 16 | 17 | option go_package = "gitlab.4pd.io/vGPUmonitor"; 18 | option java_multiple_files = true; 19 | option java_package = "io.grpc.examples.helloworld"; 20 | option java_outer_classname = "HelloWorldProto"; 21 | 22 | package pluginrpc; 23 | 24 | // The node vGPU monitoring service definition. 25 | service NodeVGPUInfo { 26 | // Returns vGPU usage for every pod on a node 27 | rpc GetNodeVGPU (GetNodeVGPURequest) returns (GetNodeVGPUReply) {} 28 | } 29 | 30 | // A process slot inside the shared region 31 | message shrregProcSlotT { 32 | int32 pid = 1; 33 | repeated uint64 used = 2; 34 | int32 status = 3; 35 | } 36 | 37 | // The sharedRegionT struct is the main struct for monitoring vgpu 38 | message sharedRegionT { 39 | int32 initializedFlag = 1; 40 | uint32 ownerPid = 2; 41 | uint32 sem = 3; 42 | repeated uint64 limit = 4; 43 | repeated uint64 sm_limit = 5; 44 | repeated shrregProcSlotT procs = 6; 45 | } 46 | 47 | message podusage { 48 | string poduuid = 1; 49 | sharedRegionT podvgpuinfo = 2; 50 | } 51 | 52 | // The request message, keyed by container UUID. 53 | message GetNodeVGPURequest { 54 | string ctruuid = 1; 55 | } 56 | 57 | // The response message containing per-pod vGPU usage 58 | message GetNodeVGPUReply { 59 | string nodeid = 1; 60 | repeated podusage nodevgpuinfo = 2; 61 | } 62 | -------------------------------------------------------------------------------- /cmd/vGPUmonitor/validation.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | ) 23 | 24 | var requiredEnvVars = map[string]bool{ 25 | "HOOK_PATH": true, 26 | "OTHER_ENV_VAR": false, 27 | } 28 | 29 | func ValidateEnvVars() error { 30 | for envVar, required := range requiredEnvVars { 31 | _, exists := os.LookupEnv(envVar) 32 | if required && !exists { 33 | return fmt.Errorf("required environment variable %s not set", envVar) 34 | } 35 | } 36 | return nil 37 | } 38 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG GOLANG_IMAGE=golang:1.22.5-bullseye 2 | ARG NVIDIA_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu20.04 3 | 4 | FROM $GOLANG_IMAGE AS build 5 | FROM $GOLANG_IMAGE AS gobuild 6 | ARG GOPROXY 7 | ARG VERSION 8 | ADD . /k8s-vgpu 9 | #RUN --mount=type=cache,target=/go/pkg/mod \ 10 | # cd /k8s-vgpu && make all 11 | RUN cd /k8s-vgpu && make all VERSION=$VERSION 12 | RUN go install github.com/NVIDIA/mig-parted/cmd/nvidia-mig-parted@v0.10.0 13 | 14 | FROM $NVIDIA_IMAGE AS nvbuild 15 | COPY ./libvgpu /libvgpu 16 | WORKDIR /libvgpu 17 | ENV DEBIAN_FRONTEND=noninteractive 18 | RUN apt-get -y update; apt-get -y install cmake 19 | RUN bash ./build.sh 20 | 21 | FROM nvidia/cuda:12.6.3-base-ubi8 22 | RUN rm -rf /usr/local/cuda-12.6/compat/libcuda.so* 23 | ENV NVIDIA_DISABLE_REQUIRE="true" 24 | ENV NVIDIA_VISIBLE_DEVICES=all 25 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility 26 | 27 | ARG VERSION 28 | LABEL version="$VERSION" 29 | LABEL maintainer="info@dynamia.ai" 30 | COPY ./LICENSE /k8s-vgpu/LICENSE 31 | COPY --from=gobuild /k8s-vgpu/bin /k8s-vgpu/bin 32 | COPY --from=gobuild /go/bin/nvidia-mig-parted /k8s-vgpu/bin/ 33 | COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh 34 | COPY ./lib /k8s-vgpu/lib 35 | COPY --from=nvbuild /libvgpu/build/libvgpu.so /k8s-vgpu/lib/nvidia/libvgpu.so."$VERSION" 36 | COPY ./docker/vgpu-init.sh /k8s-vgpu/bin/vgpu-init.sh 37 | 38 | ENV PATH="/k8s-vgpu/bin:${PATH}" 39 | ARG DEST_DIR 40 | ENTRYPOINT ["/bin/bash", "-c", "entrypoint.sh $DEST_DIR"] 41 | -------------------------------------------------------------------------------- /docker/Dockerfile.hamicore: -------------------------------------------------------------------------------- 1 | ARG NVIDIA_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu20.04 2 | 3 | FROM $NVIDIA_IMAGE AS nvbuild 4 | COPY ./libvgpu /libvgpu 5 | WORKDIR /libvgpu 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | RUN apt-get -y update; apt-get -y install cmake 8 | RUN bash ./build.sh 9 | 10 | FROM nvidia/cuda:12.6.3-base-ubi8 11 | RUN rm -rf /usr/local/cuda-12.6/compat/libcuda.so* 12 | ENV NVIDIA_DISABLE_REQUIRE="true" 13 | ENV NVIDIA_VISIBLE_DEVICES=all 14 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility 15 | 16 | ARG VERSION 17 | LABEL version="$VERSION" 18 | LABEL maintainer="projecthami@dynamia.ai" 19 | COPY --from=nvbuild /libvgpu/build/libvgpu.so /k8s-vgpu/lib/nvidia/libvgpu.so."$VERSION" 20 | -------------------------------------------------------------------------------- /docker/Dockerfile.hamimaster: -------------------------------------------------------------------------------- 1 | ARG GOLANG_IMAGE 2 | ARG HAMICORE_IMAGE 3 | FROM $GOLANG_IMAGE AS build 4 | FROM $HAMICORE_IMAGE AS corebuild 5 | 6 | FROM $GOLANG_IMAGE AS GOBUILD 7 | ADD . 
/k8s-vgpu 8 | ARG VERSION 9 | RUN go env -w GO111MODULE=on 10 | RUN cd /k8s-vgpu && make all VERSION=$VERSION 11 | RUN go install github.com/NVIDIA/mig-parted/cmd/nvidia-mig-parted@v0.10.0 12 | 13 | FROM nvidia/cuda:12.6.3-base-ubuntu22.04 14 | RUN rm -rf /usr/local/cuda-12.6/compat/libcuda.so* 15 | ENV NVIDIA_DISABLE_REQUIRE="true" 16 | ENV NVIDIA_VISIBLE_DEVICES=all 17 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility 18 | 19 | ARG VERSION 20 | LABEL version="$VERSION" 21 | LABEL maintainer="opensource@4paradigm.com" 22 | COPY ./LICENSE /k8s-vgpu/LICENSE 23 | COPY --from=GOBUILD /k8s-vgpu/bin /k8s-vgpu/bin 24 | COPY --from=GOBUILD /go/bin/nvidia-mig-parted /k8s-vgpu/bin/ 25 | COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh 26 | COPY ./docker/vgpu-init.sh /k8s-vgpu/bin/vgpu-init.sh 27 | COPY ./lib /k8s-vgpu/lib 28 | COPY --from=corebuild /k8s-vgpu/lib/nvidia/libvgpu.so."$VERSION" /k8s-vgpu/lib/nvidia/libvgpu.so."$VERSION" 29 | 30 | ENV PATH="/k8s-vgpu/bin:${PATH}" 31 | ARG DEST_DIR 32 | ENTRYPOINT ["/bin/bash", "-c", "entrypoint.sh $DEST_DIR"] 33 | -------------------------------------------------------------------------------- /docker/Dockerfile.withlib: -------------------------------------------------------------------------------- 1 | ARG GOLANG_IMAGE 2 | ARG NVIDIA_IMAGE 3 | FROM $GOLANG_IMAGE AS build 4 | 5 | FROM $GOLANG_IMAGE AS GOBUILD 6 | ADD . /k8s-vgpu 7 | ARG GOPROXY=https://goproxy.cn,direct 8 | ARG VERSION 9 | RUN go env -w GO111MODULE=on 10 | RUN cd /k8s-vgpu && make all VERSION=$VERSION 11 | RUN go install github.com/NVIDIA/mig-parted/cmd/nvidia-mig-parted@v0.10.0 12 | 13 | FROM nvidia/cuda:12.6.3-base-ubuntu22.04 14 | RUN rm -rf /usr/local/cuda-12.6/compat/libcuda.so* 15 | ENV NVIDIA_DISABLE_REQUIRE="true" 16 | ENV NVIDIA_VISIBLE_DEVICES=all 17 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility 18 | 19 | ARG VERSION 20 | LABEL version="$VERSION" 21 | LABEL maintainer="info@dynamia.ai" 22 | COPY ./LICENSE /k8s-vgpu/LICENSE 23 | COPY --from=GOBUILD /k8s-vgpu/bin /k8s-vgpu/bin 24 | COPY --from=GOBUILD /go/bin/nvidia-mig-parted /k8s-vgpu/bin/ 25 | COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh 26 | COPY ./docker/vgpu-init.sh /k8s-vgpu/bin/vgpu-init.sh 27 | COPY ./lib /k8s-vgpu/lib 28 | COPY ./libvgpu.so /k8s-vgpu/lib/nvidia/libvgpu.so."$VERSION" 29 | COPY ./license /k8s-vgpu/lib/nvidia/ 30 | COPY ./vgpuvalidator /k8s-vgpu/lib/nvidia 31 | 32 | ENV PATH="/k8s-vgpu/bin:${PATH}" 33 | ARG DEST_DIR 34 | ENTRYPOINT ["/bin/bash", "-c", "entrypoint.sh $DEST_DIR"] 35 | -------------------------------------------------------------------------------- /docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2024 The HAMi Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | # if [ $1 == "device-plugin" ]; then 18 | # cp -f /k8s-vgpu/lib/* $DEST_DIR/vgpu 19 | # fi 20 | exec "$@" 21 | -------------------------------------------------------------------------------- /docker/vgpu-init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Check if the destination directory is provided as an argument 4 | if [ -z "$1" ]; then 5 | echo "Usage: $0 <dest_dir>" 6 | exit 1 7 | fi 8 | 9 | # Source directory 10 | SOURCE_DIR="/k8s-vgpu/lib/nvidia/" 11 | 12 | # Destination directory from the argument 13 | DEST_DIR="$1" 14 | 15 | 16 | # Check if the destination directory exists, create it if it doesn't 17 | if [ ! -d "$DEST_DIR" ]; then 18 | mkdir -p "$DEST_DIR" 19 | fi 20 | 21 | # Traverse all files in the source directory 22 | find "$SOURCE_DIR" -type f | while read -r source_file; do 23 | # Get the relative path of the source file 24 | relative_path="${source_file#$SOURCE_DIR}" 25 | 26 | # Construct the destination file path 27 | dest_file="$DEST_DIR$relative_path" 28 | 29 | # If the destination file doesn't exist, copy the source file 30 | if [ ! -f "$dest_file" ]; then 31 | # Create the parent directory of the destination file if it doesn't exist 32 | mkdir -p "$(dirname "$dest_file")" 33 | 34 | # Copy the file from source to destination 35 | cp "$source_file" "$dest_file" 36 | echo "Copied: $source_file -> $dest_file" 37 | else 38 | # Compare MD5 values of source and destination files 39 | source_md5=$(md5sum "$source_file" | cut -d ' ' -f 1) 40 | dest_md5=$(md5sum "$dest_file" | cut -d ' ' -f 1) 41 | 42 | # If MD5 values are different, copy the file 43 | if [ "$source_md5" != "$dest_md5" ]; then 44 | cp "$source_file" "$dest_file" 45 | echo "Copied: $source_file -> $dest_file" 46 | else 47 | echo "Skipped (same MD5): $source_file" 48 | fi 49 | fi 50 | done 51 | -------------------------------------------------------------------------------- /docs/CHANGELOG/CHANGELOG-0.0.0.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | **Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* 4 | 5 | - [v0.0.0](#v000) 6 | - [Downloads for v0.0.0](#downloads-for-v000) 7 | - [Changelog since v0.0.0](#changelog-since-v000) 8 | - [Changes by Kind](#changes-by-kind) 9 | - [Bug Fixes](#bug-fixes) 10 | - [Others](#others) 11 | 12 | 13 | 14 | # v0.0.0 15 | ## Downloads for v0.0.0 16 | 17 | Download v0.0.0 in the [v0.0.0 release page](https://github.com/Project-HAMi/HAMi/releases/tag/v0.0.0). 18 | 19 | ## Changelog since v0.0.0 20 | ### Changes by Kind 21 | #### Bug Fixes 22 | None. 23 | 24 | #### Deprecation 25 | None. 26 | 27 | #### Others 28 | None.
29 | 30 | -------------------------------------------------------------------------------- /docs/benchmark.md: -------------------------------------------------------------------------------- 1 | ## Benchmarks 2 | 3 | Three instances from ai-benchmark have been used to evaluate vGPU-device-plugin performance as follows: 4 | 5 | | Test Environment | Description | 6 | | ---------------- | :------------------------------------------------------: | 7 | | Kubernetes version | v1.12.9 | 8 | | Docker version | 18.09.1 | 9 | | GPU Type | Tesla V100 | 10 | | GPU Num | 2 | 11 | 12 | | Test instance | Description | 13 | | ------------- | :---------------------------------------------------------: | 14 | | nvidia-device-plugin | k8s + nvidia k8s-device-plugin | 15 | | vGPU-device-plugin | k8s + VGPU k8s-device-plugin, without virtual device memory | 16 | | vGPU-device-plugin(virtual device memory) | k8s + VGPU k8s-device-plugin, with virtual device memory | 17 | 18 | Test Cases: 19 | 20 | | test id | case | type | params | 21 | | ------- | :-----------: | :-------: | :---------------------: | 22 | | 1.1 | Resnet-V2-50 | inference | batch=50,size=346*346 | 23 | | 1.2 | Resnet-V2-50 | training | batch=20,size=346*346 | 24 | | 2.1 | Resnet-V2-152 | inference | batch=10,size=256*256 | 25 | | 2.2 | Resnet-V2-152 | training | batch=10,size=256*256 | 26 | | 3.1 | VGG-16 | inference | batch=20,size=224*224 | 27 | | 3.2 | VGG-16 | training | batch=2,size=224*224 | 28 | | 4.1 | DeepLab | inference | batch=2,size=512*512 | 29 | | 4.2 | DeepLab | training | batch=1,size=384*384 | 30 | | 5.1 | LSTM | inference | batch=100,size=1024*300 | 31 | | 5.2 | LSTM | training | batch=10,size=1024*300 | 32 | 33 | Test Result: ![img](../imgs/benchmark_inf.png) 34 | 35 | ![img](../imgs/benchmark_train.png) 36 | 37 | To reproduce: 38 | 39 | 1. install the k8s-vGPU-scheduler and configure it properly 40 | 2. run the benchmark job 41 | 42 | ``` 43 | $ kubectl apply -f benchmarks/ai-benchmark/ai-benchmark.yml 44 | ``` 45 | 46 | 3.
View the result by using kubectl logs 47 | 48 | ``` 49 | $ kubectl logs [pod id] 50 | ``` -------------------------------------------------------------------------------- /docs/benchmark_cn.md: -------------------------------------------------------------------------------- 1 | ## 性能测试 2 | 3 | 在测试报告中,我们一共在下面五种场景都执行了ai-benchmark 测试脚本,并汇总最终结果: 4 | 5 | | 测试环境 | 环境描述 | 6 | | ---------------- | :------------------------------------------------------: | 7 | | Kubernetes version | v1.12.9 | 8 | | Docker version | 18.09.1 | 9 | | GPU Type | Tesla V100 | 10 | | GPU Num | 2 | 11 | 12 | | 测试名称 | 测试用例 | 13 | | -------- | :------------------------------------------------: | 14 | | Nvidia-device-plugin | k8s + nvidia官方k8s-device-plugin | 15 | | vGPU-device-plugin | k8s + VGPU k8s-device-plugin,无虚拟显存 | 16 | | vGPU-device-plugin(virtual device memory) | k8s + VGPU k8s-device-plugin,高负载,开启虚拟显存 | 17 | 18 | 测试内容 19 | 20 | | test id | 名称 | 类型 | 参数 | 21 | | ------- | :-----------: | :-------: | :---------------------: | 22 | | 1.1 | Resnet-V2-50 | inference | batch=50,size=346*346 | 23 | | 1.2 | Resnet-V2-50 | training | batch=20,size=346*346 | 24 | | 2.1 | Resnet-V2-152 | inference | batch=10,size=256*256 | 25 | | 2.2 | Resnet-V2-152 | training | batch=10,size=256*256 | 26 | | 3.1 | VGG-16 | inference | batch=20,size=224*224 | 27 | | 3.2 | VGG-16 | training | batch=2,size=224*224 | 28 | | 4.1 | DeepLab | inference | batch=2,size=512*512 | 29 | | 4.2 | DeepLab | training | batch=1,size=384*384 | 30 | | 5.1 | LSTM | inference | batch=100,size=1024*300 | 31 | | 5.2 | LSTM | training | batch=10,size=1024*300 | 32 | 33 | 测试结果: ![img](../imgs/benchmark_inf.png) 34 | 35 | ![img](../imgs/benchmark_train.png) 36 | 37 | 测试步骤: 38 | 39 | 1. 安装nvidia-device-plugin,并配置相应的参数 40 | 2. 运行benchmark任务 41 | 42 | ``` 43 | $ kubectl apply -f benchmarks/ai-benchmark/ai-benchmark.yml 44 | ``` 45 | 46 | 3.
通过kubectl logs 查看结果 47 | 48 | ``` 49 | $ kubectl logs [pod id] 50 | ``` -------------------------------------------------------------------------------- /docs/cambricon-mlu-support_cn.md: -------------------------------------------------------------------------------- 1 | ## 简介 2 | 3 | 本组件支持复用寒武纪MLU设备,并为此提供以下几种与vGPU类似的复用功能,包括: 4 | 5 | ***MLU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 6 | 7 | ***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配MLU,本组件会确保任务使用的显存不会超过分配数值 8 | 9 | ***可限制分配的算力大小***: 你现在可以用百分比来分配MLU的算力,本组件会确保任务使用的算力不会超过分配数值 10 | 11 | ***指定MLU型号***:当前任务可以通过设置annotation("cambricon.com/use-mlutype","cambricon.com/nouse-mlutype")的方式,来选择使用或者不使用某些具体型号的MLU 12 | 13 | ## 节点需求 14 | 15 | * neuware-mlu370-driver > 5.10 16 | * cntoolkit > 2.5.3 17 | 18 | ## 开启MLU复用 19 | 20 | * 通过helm部署本组件, 参照[主文档中的开启vgpu支持章节](https://github.com/Project-HAMi/HAMi/blob/master/README_cn.md#kubernetes开启vgpu支持) 21 | 22 | * 使用以下指令,为MLU节点打上label 23 | ``` 24 | kubectl label node {mlu-node} mlu=on 25 | ``` 26 | 27 | * 从您的设备提供商处获取cambricon-device-plugin,并配置以下两个参数: 28 | 29 | `mode=dynamic-smlu`, `min-dsmlu-unit=256` 30 | 31 | 它们分别代表开启MLU复用功能,与设置最小可分配的内存单元为256M,您可以参考设备提供方的文档来获取更多的配置信息。 32 | 33 | * 部署配置后的`cambricon-device-plugin` 34 | 35 | ``` 36 | kubectl apply -f cambricon-device-plugin-daemonset.yaml 37 | ``` 38 | 39 | 40 | ## 运行MLU任务 41 | 42 | ```yaml 43 | apiVersion: apps/v1 44 | kind: Deployment 45 | metadata: 46 | name: binpack-1 47 | labels: 48 | app: binpack-1 49 | spec: 50 | replicas: 1 51 | selector: 52 | matchLabels: 53 | app: binpack-1 54 | template: 55 | metadata: 56 | labels: 57 | app: binpack-1 58 | spec: 59 | containers: 60 | - name: c-1 61 | image: ubuntu:18.04 62 | command: ["sleep"] 63 | args: ["100000"] 64 | resources: 65 | limits: 66 | cambricon.com/vmlu: "1" 67 | cambricon.com/mlu.smlu.vmemory: "20" 68 | cambricon.com/mlu.smlu.vcore: "10" 69 | ``` 70 | 71 | ## 注意事项 72 | 73 | 1. 在init container中无法使用MLU复用功能,否则该任务不会被调度 74 | 75 | 2. 只有申请单MLU的任务可以指定显存`mlu.smlu.vmemory`和算力`mlu.smlu.vcore`的数值,若申请的MLU数量大于1,则所有申请的MLU都会被整卡分配 76 | -------------------------------------------------------------------------------- /docs/develop/design.md: -------------------------------------------------------------------------------- 1 | # Design 2 | 3 | 4 | 5 | The architecture of HAMi is shown in the figure above; it is organized in the form of a "chart". 6 | 7 | - MutatingWebhook 8 | 9 | The MutatingWebhook checks the validity of each task, and sets the "schedulerName" to the HAMi scheduler if the resource requests are recognized by HAMi. 10 | If not, the MutatingWebhook does nothing and passes the task to the default-scheduler. 11 | 12 | - Scheduler 13 | 14 | HAMi supports the default kube-scheduler and the volcano-scheduler; it implements an extender and registers 'Filter' and 'Score' methods to deal with sharable devices. 15 | When a pod with a sharable device request arrives, 'Filter' searches the cluster and returns a list of 'available' nodes. 'Score' scores each node 'Filter' returned, and picks the highest-scoring one to host the pod. It patches the schedule decision onto the corresponding pod annotations; for the detailed protocol, see [protocol.md](protocol.md) 16 | 17 | - DevicePlugin 18 | 19 | When the schedule decision is made, the scheduler calls the device plugin on that node to generate environment variables and mounts according to the pod annotations. 20 | Please note that the DP used here is a customized version; you need to install the one matching your device according to the [README](../../README.md).
Most official DPs will not fit in HAMi, and will result in unexpected behaviour 21 | 22 | - InContainer Control 23 | 24 | The implementation of the in-container hard limit is different for different devices. For example, HAMi-Core is responsible for NVIDIA devices, libvgpu-control.so is responsible for Iluvatar devices, etc. HAMi needs to pass the correct environment variables in order for it to operate. 25 | 26 | 27 | 28 | In summary, the flow of a pod is described in the figure above. 29 | -------------------------------------------------------------------------------- /docs/develop/imgs/flowchart.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/flowchart.jpeg -------------------------------------------------------------------------------- /docs/develop/imgs/gpu-scheduler-policy-demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/gpu-scheduler-policy-demo.png -------------------------------------------------------------------------------- /docs/develop/imgs/hami-dynamic-mig-procedure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/hami-dynamic-mig-procedure.png -------------------------------------------------------------------------------- /docs/develop/imgs/hami-dynamic-mig-structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/hami-dynamic-mig-structure.png -------------------------------------------------------------------------------- /docs/develop/imgs/node-shceduler-policy-demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/node-shceduler-policy-demo.png -------------------------------------------------------------------------------- /docs/develop/imgs/offline_validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/offline_validation.png -------------------------------------------------------------------------------- /docs/develop/imgs/protocol_pod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/protocol_pod.png -------------------------------------------------------------------------------- /docs/develop/imgs/protocol_register.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/protocol_register.png -------------------------------------------------------------------------------- /docs/develop/imgs/scheduler-policy-story.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/develop/imgs/scheduler-policy-story.png -------------------------------------------------------------------------------- /docs/develop/roadmap.md: -------------------------------------------------------------------------------- 1 | # roadmap 2 | 3 | Heterogeneous AI computing devices to support: 4 | 5 | | Product | Manufacturer | Type | MemoryIsolation | CoreIsolation | MultiCard support | 6 | |-------------|------------|-------------|-----------|---------------|-------------------| 7 | | GPU | NVIDIA | All | ✅ | ✅ | ✅ | 8 | | MLU | Cambricon | 370, 590 | ✅ | ✅ | ❌ | 9 | | GCU | Enflame | S60 | ✅ | ✅ | ❌ | 10 | | DCU | Hygon | Z100, Z100L | ✅ | ✅ | ❌ | 11 | | Ascend | Huawei | 910B | ✅ | ✅ | ❌ | 12 | | GPU | iluvatar | All | ✅ | ✅ | ❌ | 13 | | DPU | Teco | Checking | In progress | In progress | ❌ | 14 | 15 | 16 | - [ ] Support video codec processing 17 | - [ ] Support Multi-Instance GPUs (MIG) 18 | - [ ] Support Flexible scheduling policies 19 | - [x] binpack 20 | - [x] spread 21 | - [ ] numa affinity 22 | - [ ] integrated gpu-operator 23 | - [ ] Rich observability support 24 | - [ ] DRA Support 25 | - [ ] Support Intel GPU device 26 | - [ ] Support AMD GPU device 27 | - [x] Support Enflame GCU device 28 | -------------------------------------------------------------------------------- /docs/hygon-dcu-support.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | **We now support hygon.com/dcu by implementing most of the device-sharing features available for nvidia-GPU**, including: 4 | 5 | ***DCU sharing***: Each task can allocate a portion of a DCU instead of a whole DCU card, thus a DCU can be shared among multiple tasks. 6 | 7 | ***Device Memory Control***: DCUs of a certain type (e.g. Z100) can be allocated a certain device memory size, and the task is guaranteed not to exceed that boundary. 8 | 9 | ***Device compute core limitation***: DCUs can be allocated a certain percentage of device cores (e.g. hygon.com/dcucores:60 indicates this container uses 60% of the compute cores of this device) 10 | 11 | ***DCU Type Specification***: You can specify which type of DCU to use or to avoid for a certain task, by setting "hygon.com/use-dcutype" or "hygon.com/nouse-dcutype" annotations. 12 | 13 | ## Prerequisites 14 | 15 | * dtk driver >= 24.04 16 | * hy-smi v1.6.0 17 | 18 | ## Enabling DCU-sharing Support 19 | 20 | * Deploy the dcu-vgpu-device-plugin [here](https://github.com/Project-HAMi/dcu-vgpu-device-plugin) 21 | 22 | 23 | ## Running DCU jobs 24 | 25 | Hygon DCUs can now be requested by a container 26 | using the `hygon.com/dcunum`, `hygon.com/dcumem` and `hygon.com/dcucores` resource types: 27 | 28 | ```yaml 29 | apiVersion: v1 30 | kind: Pod 31 | metadata: 32 | name: alexnet-tf-gpu-pod-mem 33 | labels: 34 | purpose: demo-tf-amdgpu 35 | spec: 36 | containers: 37 | - name: alexnet-tf-gpu-container 38 | image: pytorch:resnet50 39 | workingDir: /root 40 | command: ["sleep","infinity"] 41 | resources: 42 | limits: 43 | hygon.com/dcunum: 1 # requesting a DCU 44 | hygon.com/dcumem: 2000 # each dcu requires 2000 MiB device memory 45 | hygon.com/dcucores: 60 # each dcu uses 60% of total compute cores 46 | 47 | ``` 48 | 49 | ## Enable vDCU inside container 50 | 51 | You need to enable vDCU inside the container in order to use it.
52 | ``` 53 | source /opt/hygondriver/env.sh 54 | ``` 55 | 56 | Check whether you have successfully enabled vDCU by using the following command: 57 | 58 | ``` 59 | hy-virtual -show-device-info 60 | ``` 61 | 62 | If you have an output like this, then you have successfully enabled vDCU inside the container. 63 | 64 | ``` 65 | Device 0: 66 | Actual Device: 0 67 | Compute units: 60 68 | Global memory: 2097152000 bytes 69 | ``` 70 | 71 | Then launch your DCU tasks as you usually do. 72 | 73 | ## Notes 74 | 75 | 1. DCU-sharing in init containers is not supported; pods with "hygon.com/dcumem" in an init container will never be scheduled. 76 | 77 | 2. Only one vDCU can be acquired per container. If you want to mount multiple DCU devices, then you shouldn't set `hygon.com/dcumem` or `hygon.com/dcucores` 78 | -------------------------------------------------------------------------------- /docs/hygon-dcu-support_cn.md: -------------------------------------------------------------------------------- 1 | ## 简介 2 | 3 | 本组件支持复用海光DCU设备,并为此提供以下几种与vGPU类似的复用功能,包括: 4 | 5 | ***DCU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 6 | 7 | ***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配DCU,本组件会确保任务使用的显存不会超过分配数值 8 | 9 | ***可限制计算单元数量***: 你现在可以指定任务使用的算力比例(例如60即代表使用60%算力)来分配DCU,本组件会确保任务使用的算力不会超过分配数值 10 | 11 | ***指定DCU型号***:当前任务可以通过设置annotation("hygon.com/use-dcutype","hygon.com/nouse-dcutype")的方式,来选择使用或者不使用某些具体型号的DCU 12 | 13 | ## 节点需求 14 | 15 | * dtk driver >= 24.04 16 | * hy-smi v1.6.0 17 | 18 | ## 开启DCU复用 19 | 20 | * 部署[dcu-vgpu-device-plugin](https://github.com/Project-HAMi/dcu-vgpu-device-plugin) 21 | 22 | ## 运行DCU任务 23 | 24 | ```yaml 25 | apiVersion: v1 26 | kind: Pod 27 | metadata: 28 | name: alexnet-tf-gpu-pod-mem 29 | labels: 30 | purpose: demo-tf-amdgpu 31 | spec: 32 | containers: 33 | - name: alexnet-tf-gpu-container 34 | image: pytorch:resnet50 35 | workingDir: /root 36 | command: ["sleep","infinity"] 37 | resources: 38 | limits: 39 | hygon.com/dcunum: 1 # requesting a DCU 40 | hygon.com/dcumem: 2000 # each dcu requires 2000 MiB device memory 41 | hygon.com/dcucores: 60 # each dcu uses 60% of total compute cores 42 | 43 | ``` 44 | 45 | ## 容器内开启虚拟DCU功能 46 | 47 | 使用vDCU首先需要激活虚拟环境 48 | ``` 49 | source /opt/hygondriver/env.sh 50 | ``` 51 | 52 | 随后,使用hy-virtual指令查看虚拟设备是否已经激活 53 | ``` 54 | hy-virtual -show-device-info 55 | ``` 56 | 57 | 若输出如下,则代表虚拟设备已经成功激活 58 | ``` 59 | Device 0: 60 | Actual Device: 0 61 | Compute units: 60 62 | Global memory: 2097152000 bytes 63 | ``` 64 | 65 | 接下来正常启动DCU任务即可 66 | 67 | ## 注意事项 68 | 69 | 1. 在init container中无法使用DCU复用功能,否则该任务不会被调度 70 | 71 | 2.
每个容器最多只能使用一个虚拟DCU设备,如果您希望在容器中挂载多个DCU设备,则不能使用`hygon.com/dcumem`和`hygon.com/dcucores`字段 72 | -------------------------------------------------------------------------------- /docs/mind-map/HAMI-VGPU-mind-map-Chinese.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/mind-map/HAMI-VGPU-mind-map-Chinese.png -------------------------------------------------------------------------------- /docs/mind-map/HAMI-VGPU-mind-map-Chinese.xmind: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/mind-map/HAMI-VGPU-mind-map-Chinese.xmind -------------------------------------------------------------------------------- /docs/mind-map/HAMI-VGPU-mind-map-English.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/mind-map/HAMI-VGPU-mind-map-English.png -------------------------------------------------------------------------------- /docs/mind-map/HAMI-VGPU-mind-map-English.xmind: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/mind-map/HAMI-VGPU-mind-map-English.xmind -------------------------------------------------------------------------------- /docs/mind-map/readme: -------------------------------------------------------------------------------- 1 | - 根据交流群里各位大佬的交流梳理此份思维导图(尤其感谢 @意琦行 大佬梳理的博客) 2 | - 英文版由 @隽戈 大佬提供 3 | - 若有问题处,各位大佬可随时提出 4 | 5 | - Based on the discussion among the experts in the communication group, this mind map has been compiled (special thanks to @意琦行 for organizing the blog) 6 | - The English version is provided by @隽戈 7 | - If there are any issues, anyone can raise them at any time 8 | 9 | 10 | -------------------------------------------------------------------------------- /docs/mthreads-support.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | **We now support mthreads.com/vgpu by implementing most of the device-sharing features available for nvidia-GPU**, including: 4 | 5 | ***GPU sharing***: Each task can allocate a portion of a GPU instead of a whole GPU card, thus a GPU can be shared among multiple tasks. 6 | 7 | ***Device Memory Control***: GPUs of a certain type (e.g. MTT S4000) can be allocated a certain device memory size, and the task is guaranteed not to exceed that boundary. 8 | 9 | ***Device Core Control***: GPUs of a certain type (e.g. MTT S4000) can be allocated a limited number of compute cores, and the task is guaranteed not to exceed that boundary. 10 | 11 | ## Important Notes 12 | 13 | 1. Device sharing across multiple cards is not supported. 14 | 15 | 2. Only one mthreads device can be shared in a pod (even if there are multiple containers). 16 | 17 | 3. Allocating an exclusive mthreads GPU is supported by specifying mthreads.com/vgpu only. 18 | 19 | 4.
These features have been tested on MTT S4000 20 | 21 | ## Prerequisites 22 | 23 | * [MT CloudNative Toolkits > 1.9.0](https://docs.mthreads.com/cloud-native/cloud-native-doc-online/) 24 | * driver version >= 1.2.0 25 | 26 | ## Enabling GPU-sharing Support 27 | 28 | * Deploy MT-CloudNative Toolkit on mthreads nodes (Please consult your device provider to acquire its package and documentation) 29 | 30 | > **NOTICE:** *You can remove mt-mutating-webhook and mt-gpu-scheduler after installation (optional).* 31 | 32 | * set 'devices.mthreads.enabled=true' when installing HAMi 33 | 34 | ``` 35 | helm install hami hami-charts/hami --set scheduler.kubeScheduler.imageTag={your kubernetes version} --set devices.mthreads.enabled=true -n kube-system 36 | ``` 37 | 38 | ## Running Mthreads jobs 39 | 40 | Mthreads GPUs can now be requested by a container 41 | using the `mthreads.com/vgpu`, `mthreads.com/sgpu-memory` and `mthreads.com/sgpu-core` resource types: 42 | 43 | ```yaml 44 | apiVersion: v1 45 | kind: Pod 46 | metadata: 47 | name: gpushare-pod-default 48 | spec: 49 | restartPolicy: OnFailure 50 | containers: 51 | - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc 52 | imagePullPolicy: IfNotPresent 53 | name: gpushare-pod-1 54 | command: ["sleep"] 55 | args: ["100000"] 56 | resources: 57 | limits: 58 | mthreads.com/vgpu: 1 59 | mthreads.com/sgpu-memory: 32 60 | mthreads.com/sgpu-core: 8 61 | ``` 62 | 63 | > **NOTICE1:** *Each unit of sgpu-memory indicates 512M device memory* 64 | 65 | > **NOTICE2:** *You can find more examples in the [examples/mthreads folder](../examples/mthreads/)* 66 | -------------------------------------------------------------------------------- /docs/mthreads-support_cn.md: -------------------------------------------------------------------------------- 1 | ## 简介 2 | 3 | 本组件支持复用摩尔线程GPU设备,并为此提供以下几种与vGPU类似的复用功能,包括: 4 | 5 | ***GPU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 6 | 7 | ***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配GPU,本组件会确保任务使用的显存不会超过分配数值 8 | 9 | ***可限制分配的算力核组比例***: 你现在可以用算力核组数量(例如8个)来分配GPU,本组件会确保任务使用的算力不会超过分配数值 10 | 11 | ## 注意事项 12 | 13 | 1. 暂时不支持多卡切片,多卡任务只能分配整卡 14 | 15 | 2. 一个pod只能使用一个GPU生成的切片,即使该pod中有多个容器 16 | 17 | 3. 支持独占模式,只指定`mthreads.com/vgpu`即为独占申请 18 | 19 | 4.
本特性目前只支持MTT S4000设备 20 | 21 | ## 节点需求 22 | 23 | * [MT CloudNative Toolkits > 1.9.0](https://docs.mthreads.com/cloud-native/cloud-native-doc-online/) 24 | * 驱动版本 >= 1.2.0 25 | 26 | ## 开启GPU复用 27 | 28 | * 部署'MT-CloudNative Toolkit',摩尔线程的GPU共享需要配合厂家提供的该组件一起使用,请联系设备提供方获取 29 | 30 | > **注意:** *(可选)部署完之后,可卸载掉mt-mutating-webhook与mt-scheduler组件,因为这部分功能将由HAMi调度器提供* 31 | 32 | * 在安装HAMi时配置'devices.mthreads.enabled=true'参数 33 | 34 | ``` 35 | helm install hami hami-charts/hami --set scheduler.kubeScheduler.imageTag={your kubernetes version} --set devices.mthreads.enabled=true -n kube-system 36 | ``` 37 | 38 | ## 运行GPU任务 39 | 40 | 通过指定`mthreads.com/vgpu`、`mthreads.com/sgpu-memory`和`mthreads.com/sgpu-core`这3个参数,可以确定容器申请的切片个数、对应的显存和算力核组 41 | 42 | ```yaml 43 | apiVersion: v1 44 | kind: Pod 45 | metadata: 46 | name: gpushare-pod-default 47 | spec: 48 | restartPolicy: OnFailure 49 | containers: 50 | - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc 51 | imagePullPolicy: IfNotPresent 52 | name: gpushare-pod-1 53 | command: ["sleep"] 54 | args: ["100000"] 55 | resources: 56 | limits: 57 | mthreads.com/vgpu: 1 58 | mthreads.com/sgpu-memory: 32 59 | mthreads.com/sgpu-core: 8 60 | ``` 61 | 62 | > **注意1:** *每一单位的sgpu-memory代表512M的显存.* 63 | 64 | > **注意2:** *查看更多的[用例](../examples/mthreads/).* 65 | -------------------------------------------------------------------------------- /docs/offline-install.md: -------------------------------------------------------------------------------- 1 | # Offline-install Manual 2 | 3 | For clusters that don't have external web access, you can install HAMi by the following steps: 4 | 5 | 1. Refer to [README.md](../README.md) until step 'Install and Uninstall' 6 | 7 | 2. pull the following images, save them into '.tar' files, then move them into your cluster 8 | 9 | Image list: 10 | ``` 11 | projecthami/hami:{HAMi version} 12 | docker.io/jettech/kube-webhook-certgen:v1.5.2 13 | liangjw/kube-webhook-certgen:v1.1.1 14 | registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler:{your kubernetes version} 15 | ``` 16 | 17 | ``` 18 | docker pull {image_name} && docker save {image_name} -o {image_name}.tar 19 | ``` 20 | 21 | 3. Load these images using docker load, tag these images with your registry, and push them into your registry 22 | 23 | ``` 24 | docker load -i {HAMi_image}.tar 25 | docker tag projecthami/hami:{HAMi version} {your_inner_registry}/hami:{HAMi version} 26 | docker push {your_inner_registry}/hami:{HAMi version} 27 | docker tag docker.io/jettech/kube-webhook-certgen:v1.5.2 {your_inner_registry}/kube-webhook-certgen:v1.5.2 28 | docker push {your_inner_registry}/kube-webhook-certgen:v1.5.2 29 | docker tag liangjw/kube-webhook-certgen:v1.1.1 {your_inner_registry}/kube-webhook-certgen:v1.1.1 30 | docker push {your_inner_registry}/kube-webhook-certgen:v1.1.1 31 | docker tag registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler:{your kubernetes version} {your_inner_registry}/kube-scheduler:{your kubernetes version} 32 | docker push {your_inner_registry}/kube-scheduler:{your kubernetes version} 33 | ``` 34 | 35 | 4. Download the charts folder from [github](https://github.com/Project-HAMi/HAMi/tree/master/charts), place it into ${CHART_PATH} inside the cluster, then edit the following fields in ${CHART_PATH}/hami/values.yaml. 36 | 37 | ``` 38 | scheduler.kubeScheduler.image 39 | scheduler.extender.image 40 | scheduler.patch.image 41 | scheduler.patch.imageNew 42 | scheduler.devicePlugin.image 43 | scheduler.devicePlugin.monitorimage 44 | ``` 45 | 46 | 5.
Execute the following command in your ${CHART_PATH} folder 47 | 48 | ``` 49 | helm install hami hami --set scheduler.kubeScheduler.imageTag={your k8s server version} -n kube-system 50 | ``` 51 | 52 | 6. Verify your installation 53 | 54 | execute the following command: 55 | ``` 56 | kubectl get pods -n kube-system 57 | ``` 58 | 59 | If you can see both the 'device-plugin' and 'scheduler' pods running, then HAMi is installed successfully, as shown in the figure below: 60 | 61 | 62 | -------------------------------------------------------------------------------- /docs/proposals/e2e_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/proposals/e2e_test.png -------------------------------------------------------------------------------- /docs/proposals/gpu_utilization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/docs/proposals/gpu_utilization.png -------------------------------------------------------------------------------- /example.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | labels: 5 | kubernetes.io/metadata.name: gpu-test-workloads 6 | pod-security.kubernetes.io/enforce: privileged 7 | name: gpu-test-workloads 8 | --- 9 | apiVersion: apps/v1 10 | kind: Deployment 11 | metadata: 12 | name: cuda-sample-vector-add 13 | namespace: gpu-test-workloads 14 | labels: 15 | app: cuda-sample-vector-add 16 | spec: 17 | replicas: 1 18 | selector: 19 | matchLabels: 20 | app: cuda-sample-vector-add 21 | template: 22 | metadata: 23 | labels: 24 | app: cuda-sample-vector-add 25 | spec: 26 | containers: 27 | - name: cuda-sample-vector-add 28 | image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04 29 | command: 30 | - /bin/bash 31 | - '-c' 32 | - '--' 33 | args: 34 | - while true; do /cuda-samples/vectorAdd; done 35 | resources: 36 | limits: 37 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs 38 | nvidia.com/gpumem: 3000 # Each vGPU contains 3000M device memory (Optional,Integer) 39 | terminationMessagePath: /dev/termination-log 40 | terminationMessagePolicy: File 41 | imagePullPolicy: IfNotPresent 42 | restartPolicy: Always 43 | terminationGracePeriodSeconds: 30 44 | dnsPolicy: ClusterFirst 45 | hostPID: true 46 | securityContext: {} 47 | schedulerName: default-scheduler 48 | tolerations: 49 | - key: nvidia.com/gpu 50 | operator: Exists 51 | effect: NoSchedule 52 | priorityClassName: system-cluster-critical 53 | strategy: 54 | type: RollingUpdate 55 | rollingUpdate: 56 | maxUnavailable: 25% 57 | maxSurge: 25% 58 | revisionHistoryLimit: 10 59 | progressDeadlineSeconds: 600 -------------------------------------------------------------------------------- /examples/ascend/job-310P.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: ascend310p-job 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | huawei.com/Ascend310P: 1 # requesting 1 NPU 13 | huawei.com/Ascend310P-memory: 2000 # requesting 2000m device memory
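The Ascend examples that follow use the same pattern as the NVIDIA example.yaml above: a device-count resource plus an optional per-device memory resource. A quick way to check that the HAMi scheduler (rather than the default scheduler) handled such a pod is to inspect the annotations the scheduler patches onto it, as described in design.md and protocol.md earlier in this repo. A minimal sketch, assuming the pod name from job-310P.yaml and a cluster where HAMi and the Ascend device plugin are already installed; the exact annotation keys are device-specific and not confirmed here:

```
kubectl apply -f examples/ascend/job-310P.yaml
kubectl get pod ascend310p-job -o jsonpath='{.metadata.annotations}'
```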
-------------------------------------------------------------------------------- /examples/ascend/job-910A.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: ascend910a-job 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | huawei.com/Ascend910A: 1 # requesting 1 NPU 13 | huawei.com/Ascend910A-memory: 2000 # requesting 2000m device memory -------------------------------------------------------------------------------- /examples/ascend/job-910B2.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: ascend910b2-job 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | huawei.com/Ascend910B2: 1 # requesting 1 NPU 13 | huawei.com/Ascend910B2-memory: 2000 # requesting 2000m device memory -------------------------------------------------------------------------------- /examples/ascend/job-910B3.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: ascend910b-job 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | huawei.com/Ascend910B: 1 # requesting 1 NPU 13 | huawei.com/Ascend910B-memory: 2000 # requesting 2000m device memory -------------------------------------------------------------------------------- /examples/ascend/job-910B4.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: ascend910b4-job 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | huawei.com/Ascend910B4: 1 # requesting 1 NPU 13 | huawei.com/Ascend910B4-memory: 2000 # requesting 2000m device memory -------------------------------------------------------------------------------- /examples/enflame/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gcushare-pod-2 5 | namespace: kube-system 6 | spec: 7 | terminationGracePeriodSeconds: 0 8 | containers: 9 | - name: pod-gcu-example1 10 | image: ubuntu:18.04 11 | imagePullPolicy: IfNotPresent 12 | command: 13 | - sleep 14 | args: 15 | - '100000' 16 | resources: 17 | limits: 18 | enflame.com/vgcu: 1 19 | enflame.com/vgcu-percentage: 22 -------------------------------------------------------------------------------- /examples/enflame/use_exclusive.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gcushare-pod-4 5 | namespace: kube-system 6 | spec: 7 | terminationGracePeriodSeconds: 0 8 | containers: 9 | - name: pod-gcu-example3 10 | image: ubuntu:18.04 11 | imagePullPolicy: IfNotPresent 12 | command: 13 | - sleep 14 | args: 15 | - '100000' 16 | resources: 17 | limits: 18 | enflame.com/vgcu: 1 
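Note that the two Enflame examples above differ only in `enflame.com/vgcu-percentage`: default_use.yaml shares a GCU by requesting 22 percent of one card, while use_exclusive.yaml omits the percentage and therefore requests the whole card. A minimal sketch for inspecting what an Enflame node actually advertises, assuming `<gcu-node>` is a hypothetical placeholder you replace with a real node name:

```
kubectl describe node <gcu-node> | grep -i enflame.com
```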
-------------------------------------------------------------------------------- /examples/hygon/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: alexnet-tf-gpu-pod-mem 5 | labels: 6 | purpose: demo-tf-amdgpu 7 | spec: 8 | containers: 9 | - name: alexnet-tf-gpu-container 10 | image: pytorch:resnet50 11 | workingDir: /root 12 | command: ["sleep","infinity"] 13 | resources: 14 | limits: 15 | hygon.com/dcunum: 1 # requesting a DCU 16 | hygon.com/dcumem: 2000 # each dcu requires 2000 MiB device memory 17 | hygon.com/dcucores: 60 # each dcu uses 60% of total compute cores 18 | -------------------------------------------------------------------------------- /examples/hygon/specify_card_type_not_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: alexnet-tf-gpu-pod-mem 5 | annotations: 6 | hygon.com/nouse-dcutype: "Z100L" # Specify the card types to avoid for this job, use commas to separate; the job will not launch on the specified cards 7 | #In this example, we don't want this container to run on Z100L 8 | purpose: demo-tf-amdgpu 9 | spec: 10 | containers: 11 | - name: alexnet-tf-gpu-container 12 | image: pytorch:resnet50 13 | workingDir: /root 14 | command: ["sleep","infinity"] 15 | resources: 16 | limits: 17 | hygon.com/dcunum: 1 # requesting a DCU 18 | hygon.com/dcumem: 2000 19 | hygon.com/dcucores: 60 20 | -------------------------------------------------------------------------------- /examples/hygon/specify_card_type_to_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: alexnet-tf-gpu-pod-mem 5 | annotations: 6 | hygon.com/use-dcutype: "Z100" # Specify the card types for this job, use commas to separate; the job will not launch on non-specified cards 7 | #In this example, we want to run this job on Z100 8 | labels: 9 | purpose: demo-tf-amdgpu 10 | spec: 11 | containers: 12 | - name: alexnet-tf-gpu-container 13 | image: pytorch:resnet50 14 | workingDir: /root 15 | command: ["sleep","infinity"] 16 | resources: 17 | limits: 18 | hygon.com/dcunum: 1 # requesting a DCU 19 | hygon.com/dcumem: 2000 20 | hygon.com/dcucores: 60 21 | -------------------------------------------------------------------------------- /examples/iluvatar/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: poddemo 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: poddemo 9 | image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e 10 | command: 11 | - bash 12 | args: 13 | - -c 14 | - | 15 | set -ex 16 | echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc 17 | cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ 18 | cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ 19 | source /root/.bashrc 20 | sleep 360000 21 | resources: 22 | requests: 23 | iluvatar.ai/vgpu: 1 24 | iluvatar.ai/vcuda-core: 50 25 | iluvatar.ai/vcuda-memory: 64 26 | limits: 27 | iluvatar.ai/vgpu: 1 28 | iluvatar.ai/vcuda-core: 50 29 | iluvatar.ai/vcuda-memory: 64 -------------------------------------------------------------------------------- /examples/iluvatar/multi-containers.yaml:
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: poddemo 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: poddemo 9 | image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e 10 | command: 11 | - bash 12 | args: 13 | - -c 14 | - | 15 | set -ex 16 | echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc 17 | cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ 18 | cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ 19 | source /root/.bashrc 20 | sleep 360000 21 | resources: 22 | requests: 23 | iluvatar.ai/vgpu: 1 24 | iluvatar.ai/vcuda-core: 50 25 | iluvatar.ai/vcuda-memory: 64 26 | limits: 27 | iluvatar.ai/vgpu: 1 28 | iluvatar.ai/vcuda-core: 50 29 | iluvatar.ai/vcuda-memory: 64 30 | - name: poddemo1 31 | image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e 32 | command: 33 | - bash 34 | args: 35 | - -c 36 | - | 37 | set -ex 38 | echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc 39 | cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ 40 | cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ 41 | source /root/.bashrc 42 | sleep 360000 43 | resources: 44 | requests: 45 | iluvatar.ai/vgpu: 1 46 | iluvatar.ai/vcuda-core: 50 47 | iluvatar.ai/vcuda-memory: 64 48 | limits: 49 | iluvatar.ai/vgpu: 1 50 | iluvatar.ai/vcuda-core: 50 51 | iluvatar.ai/vcuda-memory: 64 -------------------------------------------------------------------------------- /examples/iluvatar/multi-devices.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: poddemo 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: poddemo 9 | image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e 10 | command: 11 | - bash 12 | args: 13 | - -c 14 | - | 15 | set -ex 16 | echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc 17 | cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ 18 | cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ 19 | source /root/.bashrc 20 | sleep 360000 21 | resources: 22 | requests: 23 | iluvatar.ai/vgpu: 2 24 | limits: 25 | iluvatar.ai/vgpu: 2 -------------------------------------------------------------------------------- /examples/metax/gpu/binpack.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod1 5 | annotations: 6 | hami.io/node-scheduler-policy: "binpack" # when this parameter is set to binpack, the scheduler will try to minimize the topology loss. 
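# (Compare with spread.yaml below, which instead tries to find the best topology for a single task; imgs/metax_binpack.png and imgs/metax_spread.png illustrate the two placements.)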
7 | spec: 8 | containers: 9 | - name: ubuntu-container 10 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 11 | imagePullPolicy: IfNotPresent 12 | command: ["sleep","infinity"] 13 | resources: 14 | limits: 15 | metax-tech.com/gpu: 1 # requesting 1 vGPU -------------------------------------------------------------------------------- /examples/metax/gpu/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod1 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 9 | imagePullPolicy: IfNotPresent 10 | command: ["sleep","infinity"] 11 | resources: 12 | limits: 13 | metax-tech.com/gpu: 1 # requesting 1 vGPU -------------------------------------------------------------------------------- /examples/metax/gpu/spread.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod1 5 | annotations: 6 | hami.io/node-scheduler-policy: "spread" # when this parameter is set to spread, the scheduler will try to find the best topology for this task. 7 | spec: 8 | containers: 9 | - name: ubuntu-container 10 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 11 | imagePullPolicy: IfNotPresent 12 | command: ["sleep","infinity"] 13 | resources: 14 | limits: 15 | metax-tech.com/gpu: 1 # requesting 1 vGPU -------------------------------------------------------------------------------- /examples/metax/sgpu/allocate_exclusive.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 9 | imagePullPolicy: IfNotPresent 10 | command: ["sleep","infinity"] 11 | resources: 12 | limits: 13 | metax-tech.com/sgpu: 1 # requesting 1 exclusive GPU -------------------------------------------------------------------------------- /examples/metax/sgpu/allocate_specific_gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | metax-tech.com/use-gpuuuid: "36beae85-c835-6b14-6ab2-02671837a59c" # allocate a specific GPU by UUID 7 | spec: 8 | containers: 9 | - name: ubuntu-container 10 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 11 | imagePullPolicy: IfNotPresent 12 | command: ["sleep","infinity"] 13 | resources: 14 | limits: 15 | metax-tech.com/sgpu: 1 # requesting 1 GPU 16 | metax-tech.com/vcore: 60 # each GPU uses 60% of the total compute cores 17 | metax-tech.com/vmemory: 4 # each GPU requires 4 GiB of device memory -------------------------------------------------------------------------------- /examples/metax/sgpu/allocate_vmemory_MiB.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 9 | imagePullPolicy: IfNotPresent 10 | command: ["sleep","infinity"] 11 | resources: 12 | limits: 13 | metax-tech.com/sgpu: 1 # requesting 1
GPU 14 | metax-tech.com/vcore: 60 # each GPU uses 60% of the total compute cores 15 | metax-tech.com/vmemory: 2048Mi # each GPU requires 2048 MiB of device memory -------------------------------------------------------------------------------- /examples/metax/sgpu/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 9 | imagePullPolicy: IfNotPresent 10 | command: ["sleep","infinity"] 11 | resources: 12 | limits: 13 | metax-tech.com/sgpu: 1 # requesting 1 GPU 14 | metax-tech.com/vcore: 60 # each GPU uses 60% of the total compute cores 15 | metax-tech.com/vmemory: 4 # each GPU requires 4 GiB of device memory -------------------------------------------------------------------------------- /examples/metax/sgpu/multi-containers.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container-1 8 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 9 | imagePullPolicy: IfNotPresent 10 | command: ["sleep","infinity"] 11 | resources: 12 | limits: 13 | metax-tech.com/sgpu: 1 # requesting 1 GPU 14 | metax-tech.com/vcore: 60 # this container uses 60% of its GPU's compute cores 15 | metax-tech.com/vmemory: 4 # this container requires 4 GiB of device memory 16 | - name: ubuntu-container-2 17 | image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 18 | imagePullPolicy: IfNotPresent 19 | command: ["sleep","infinity"] 20 | resources: 21 | limits: 22 | metax-tech.com/sgpu: 1 # requesting 1 GPU 23 | metax-tech.com/vcore: 30 # this container uses 30% of its GPU's compute cores 24 | metax-tech.com/vmemory: 8 # this container requires 8 GiB of device memory -------------------------------------------------------------------------------- /examples/mlu/allocate_whole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: binpack-1 5 | labels: 6 | app: binpack-1 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: binpack-1 12 | template: 13 | metadata: 14 | labels: 15 | app: binpack-1 16 | spec: 17 | containers: 18 | - name: c-1 19 | image: ubuntu:18.04 20 | command: ["sleep"] 21 | args: ["100000"] 22 | resources: 23 | limits: 24 | cambricon.com/vmlu: "1" # allocates a whole MLU -------------------------------------------------------------------------------- /examples/mlu/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: binpack-1 5 | labels: 6 | app: binpack-1 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: binpack-1 12 | template: 13 | metadata: 14 | labels: 15 | app: binpack-1 16 | spec: 17 | containers: 18 | - name: c-1 19 | image: ubuntu:18.04 20 | command: ["sleep"] 21 | args: ["100000"] 22 | resources: 23 | limits: 24 | cambricon.com/vmlu: "1" 25 | cambricon.com/mlu370.smlu.vmemory: "20" 26 | cambricon.com/mlu370.smlu.vcore: "10" -------------------------------------------------------------------------------- /examples/mthreads/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2
| kind: Pod 3 | metadata: 4 | name: gpushare-pod-default 5 | spec: 6 | restartPolicy: OnFailure 7 | containers: 8 | - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc 9 | imagePullPolicy: IfNotPresent 10 | name: gpushare-pod-1 11 | command: ["sleep"] 12 | args: ["100000"] 13 | resources: 14 | limits: 15 | mthreads.com/vgpu: 1 16 | mthreads.com/sgpu-memory: 32 17 | mthreads.com/sgpu-core: 8 -------------------------------------------------------------------------------- /examples/mthreads/multi_cards.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpushare-pod-multi-cards 5 | spec: 6 | restartPolicy: OnFailure 7 | containers: 8 | - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc 9 | imagePullPolicy: IfNotPresent 10 | name: gpushare-pod-1 11 | command: ["sleep"] 12 | args: ["100000"] 13 | resources: 14 | limits: 15 | mthreads.com/vgpu: 2 -------------------------------------------------------------------------------- /examples/mthreads/use_exclusive.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpushare-pod-exclusive 5 | spec: 6 | restartPolicy: OnFailure 7 | containers: 8 | - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc 9 | imagePullPolicy: IfNotPresent 10 | name: gpushare-pod-1 11 | command: ["sleep"] 12 | args: ["100000"] 13 | resources: 14 | limits: 15 | mthreads.com/vgpu: 1 -------------------------------------------------------------------------------- /examples/nvidia/default_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:22.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs 13 | nvidia.com/gpumem: 3000 # each physical GPU allocates 3000 MB of device memory to the pod (Optional, Integer) 14 | nvidia.com/gpucores: 30 # each physical GPU allocates 30% of its compute cores to the pod (Optional, Integer) 15 | -------------------------------------------------------------------------------- /examples/nvidia/default_use_legacy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs 13 | -------------------------------------------------------------------------------- /examples/nvidia/dynamic_mig_example.yaml: -------------------------------------------------------------------------------- 1 | ## This example will allocate 2g.10gb * 2 for an A100-40GB-PCIE device 2 | ## or 1g.10gb * 2 for an A100-80GB-SXM device.
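## (In broad terms, HAMi maps the per-GPU gpumem request below onto the smallest MIG template that satisfies it on each device model, which is why the chosen profile differs between the two cards above; see docs/dynamic-mig-support.md for the exact selection rules.)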
3 | apiVersion: v1 4 | kind: Pod 5 | metadata: 6 | name: gpu-pod 7 | annotations: 8 | nvidia.com/vgpu-mode: "mig" 9 | hami.io/gpu-scheduler-policy: "binpack" #(Optional) 10 | spec: 11 | containers: 12 | - name: ubuntu-container 13 | image: ubuntu:18.04 14 | command: ["bash", "-c", "sleep 86400"] 15 | resources: 16 | limits: 17 | nvidia.com/gpu: 2 18 | nvidia.com/gpumem: 8000 19 | 20 | -------------------------------------------------------------------------------- /examples/nvidia/example.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs 13 | #nvidia.com/gpumem: 3000 # each physical GPU allocates 3000 MB of device memory to the pod 14 | nvidia.com/gpumem-percentage: 50 # each physical GPU allocates 50% of its device memory to the pod. Cannot be combined with nvidia.com/gpumem 15 | #nvidia.com/gpucores: 90 # each physical GPU allocates 90% of its compute cores to the pod 16 | #nvidia.com/priority: 0 # there are only two priority classes, 0 (high) and 1 (low); default: 1 17 | # The utilization of a high-priority task won't be limited to resourceCores unless it shares its GPU with other high-priority tasks. 18 | # The utilization of a low-priority task won't be limited to resourceCores if no other task shares its GPU. 19 | - name: ubuntu-container0 20 | image: ubuntu:18.04 21 | command: ["bash", "-c", "sleep 86400"] 22 | - name: ubuntu-container1 23 | image: ubuntu:18.04 24 | command: ["bash", "-c", "sleep 86400"] 25 | resources: 26 | limits: 27 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs 28 | nvidia.com/gpumem: 2000 # each physical GPU allocates 2000 MB of device memory to the pod (Optional, Integer) 29 | #nvidia.com/gpucores: 90 # each physical GPU allocates 90% of its compute cores to the pod 30 | 31 | -------------------------------------------------------------------------------- /examples/nvidia/mig_example.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/mig-3g.20gb: 1 # requesting 1 MIG 3g.20gb instance 13 | -------------------------------------------------------------------------------- /examples/nvidia/specify_card_type_not_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | # You can run: kubectl get node $node -o jsonpath='{.metadata.annotations.hami\.io/node-nvidia-register}' to get the registered GPU info 7 | # The full GPU type name is like NVIDIA-NVIDIA A100, while the short name is like A100 8 | nvidia.com/nouse-gputype: "1080,2080" # Blacklist card types for this job; separate multiple types with commas. The job will not run on the specified card types 9 | # In this example, we don't want our job to run on 1080 (including 1080 Ti) or 2080 (including 2080 Ti) cards.
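# (As the note above implies, type matching is done by substring, which is why the entry "1080" also covers the 1080 Ti variants.)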
10 | spec: 11 | containers: 12 | - name: ubuntu-container 13 | image: ubuntu:18.04 14 | command: ["bash", "-c", "sleep 86400"] 15 | resources: 16 | limits: 17 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs 18 | -------------------------------------------------------------------------------- /examples/nvidia/specify_card_type_to_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | # You can run: kubectl get node $node -o jsonpath='{.metadata.annotations.hami\.io/node-nvidia-register}' to get the registered GPU info 7 | # The full GPU type name is like NVIDIA-NVIDIA A100, while the short name is like A100 8 | nvidia.com/use-gputype: "A100,V100" # Specify the card types for this job; separate multiple types with commas. The job will only run on the specified card types 9 | # In this example, we want to run this job on an A100 or a V100 10 | spec: 11 | containers: 12 | - name: ubuntu-container 13 | image: ubuntu:18.04 14 | command: ["bash", "-c", "sleep 86400"] 15 | resources: 16 | limits: 17 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs 18 | -------------------------------------------------------------------------------- /examples/nvidia/specify_scheduling_policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | hami.io/node-scheduler-policy: "spread" # when this parameter is set to spread, the scheduler will try to place the pod on a different GPU node from other pods, spreading work across nodes. 7 | hami.io/gpu-scheduler-policy: "binpack" # when this parameter is set to binpack, the scheduler will try to pack the pod onto an already-shared GPU card. 8 | spec: 9 | containers: 10 | - name: ubuntu-container 11 | image: ubuntu:18.04 12 | command: ["bash", "-c", "sleep 86400"] 13 | resources: 14 | limits: 15 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs 16 | -------------------------------------------------------------------------------- /examples/nvidia/specify_uuid_not_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | # You can run: kubectl get node $node -o jsonpath='{.metadata.annotations.hami\.io/node-nvidia-register}' to get the registered GPU info, including UUIDs 7 | # A UUID looks like GPU-03f69c50-207a-2038-9b45-23cac89cb67d 8 | nvidia.com/nouse-gpuuuid: "GPU-03f69c50-207a-2038-9b45-23cac89cb67d" # Blacklist card UUIDs for this job; separate multiple UUIDs with commas. The job will not run on the specified cards 9 | # In this example, we don't want the job to run on GPU-03f69c50-207a-2038-9b45-23cac89cb67d.
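# (Tip: besides the node annotation above, running nvidia-smi -L on the node lists every GPU together with its UUID.)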
10 | spec: 11 | containers: 12 | - name: ubuntu-container 13 | image: ubuntu:18.04 14 | command: ["bash", "-c", "sleep 86400"] 15 | resources: 16 | limits: 17 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs -------------------------------------------------------------------------------- /examples/nvidia/specify_uuid_to_use.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | annotations: 6 | # You can run: kubectl get node $node -o jsonpath='{.metadata.annotations.hami\.io/node-nvidia-register}' to get the registered GPU info, including UUIDs 7 | # A UUID looks like GPU-03f69c50-207a-2038-9b45-23cac89cb67d 8 | nvidia.com/use-gpuuuid: "GPU-03f69c50-207a-2038-9b45-23cac89cb67d,GPU-03f69c50-207a-2038-9b45-23cac89cb67e" # Specify the card UUIDs for this job, separated by commas. The job will only run on the specified cards 9 | # In this example, we want to run this job on GPU-03f69c50-207a-2038-9b45-23cac89cb67d or GPU-03f69c50-207a-2038-9b45-23cac89cb67e 10 | spec: 11 | containers: 12 | - name: ubuntu-container 13 | image: ubuntu:18.04 14 | command: ["bash", "-c", "sleep 86400"] 15 | resources: 16 | limits: 17 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs 18 | -------------------------------------------------------------------------------- /examples/nvidia/use_as_normal.yaml: -------------------------------------------------------------------------------- 1 | # gpu-pod1 and gpu-pod2 will NOT share the same GPU 2 | apiVersion: v1 3 | kind: Pod 4 | metadata: 5 | name: gpu-pod1 6 | spec: 7 | containers: 8 | - name: ubuntu-container 9 | image: ubuntu:18.04 10 | command: ["bash", "-c", "sleep 86400"] 11 | resources: 12 | limits: 13 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs 14 | --- 15 | apiVersion: v1 16 | kind: Pod 17 | metadata: 18 | name: gpu-pod2 19 | spec: 20 | containers: 21 | - name: ubuntu-container 22 | image: ubuntu:18.04 23 | command: ["bash", "-c", "sleep 86400"] 24 | resources: 25 | limits: 26 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs -------------------------------------------------------------------------------- /examples/nvidia/use_exclusive_card.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod1 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs 13 | nvidia.com/gpumem-percentage: 100 # each physical GPU allocates 100% of its device memory to the pod (Optional, Integer) 14 | nvidia.com/gpucores: 100 # each physical GPU allocates 100% of its compute cores to the pod (Optional, Integer) 15 | -------------------------------------------------------------------------------- /examples/nvidia/use_memory_fraction.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-pod 5 | spec: 6 | containers: 7 | - name: ubuntu-container 8 | image: ubuntu:18.04 9 | command: ["bash", "-c", "sleep 86400"] 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 2 # declare how many physical GPUs the pod needs 13 | nvidia.com/gpumem-percentage: 50 # each physical GPU allocates 50% of its device memory to the pod (Optional, Integer) 14 | nvidia.com/gpucores: 30 # each physical GPU allocates 30% of its compute cores
to the pod (Optional, Integer) 15 | -------------------------------------------------------------------------------- /examples/nvidia/use_sharing_card.yaml: -------------------------------------------------------------------------------- 1 | # gpu-pod1 and gpu-pod2 could share the same GPU 2 | apiVersion: v1 3 | kind: Pod 4 | metadata: 5 | name: gpu-pod1 6 | spec: 7 | containers: 8 | - name: ubuntu-container 9 | image: ubuntu:18.04 10 | command: ["bash", "-c", "sleep 86400"] 11 | resources: 12 | limits: 13 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs 14 | nvidia.com/gpumem-percentage: 40 # each physical GPU allocates 40% of its device memory to the pod (Optional, Integer) 15 | nvidia.com/gpucores: 60 # each physical GPU allocates 60% of its compute cores to the pod (Optional, Integer) 16 | --- 17 | apiVersion: v1 18 | kind: Pod 19 | metadata: 20 | name: gpu-pod2 21 | spec: 22 | containers: 23 | - name: ubuntu-container 24 | image: ubuntu:18.04 25 | command: ["bash", "-c", "sleep 86400"] 26 | resources: 27 | limits: 28 | nvidia.com/gpu: 1 # declare how many physical GPUs the pod needs 29 | nvidia.com/gpumem-percentage: 60 # each physical GPU allocates 60% of its device memory to the pod (Optional, Integer) 30 | nvidia.com/gpucores: 40 # each physical GPU allocates 40% of its compute cores to the pod (Optional, Integer) 31 | -------------------------------------------------------------------------------- /hack/.import-aliases: -------------------------------------------------------------------------------- 1 | { 2 | "k8s.io/api/admissionregistration/v1": "admissionregistrationv1", 3 | "k8s.io/api/admissionregistration/v1beta1": "admissionregistrationv1beta1", 4 | "k8s.io/api/admission/v1beta1": "admissionv1beta1", 5 | "k8s.io/api/admission/v1": "admissionv1", 6 | "k8s.io/api/apps/v1": "appsv1", 7 | "k8s.io/api/apps/v1beta1": "appsv1beta1", 8 | "k8s.io/api/apps/v1beta2": "appsv1beta2", 9 | "k8s.io/api/authentication/v1": "authenticationv1", 10 | "k8s.io/api/authentication/v1beta1": "authenticationv1beta1", 11 | "k8s.io/api/authorization/v1": "authorizationv1", 12 | "k8s.io/api/authorization/v1beta1": "authorizationv1beta1", 13 | "k8s.io/api/autoscaling/v1": "autoscalingv1", 14 | "k8s.io/api/autoscaling/v2": "autoscalingv2", 15 | "k8s.io/api/batch/v1": "batchv1", 16 | "k8s.io/api/batch/v1beta1": "batchv1beta1", 17 | "k8s.io/api/certificates/v1beta1": "certificatesv1beta1", 18 | "k8s.io/api/coordination/v1": "coordinationv1", 19 | "k8s.io/api/coordination/v1beta1": "coordinationv1beta1", 20 | "k8s.io/api/core/v1": "corev1", 21 | "k8s.io/api/discovery/v1": "discoveryv1", 22 | "k8s.io/api/events/v1": "eventsv1", 23 | "k8s.io/api/events/v1beta1": "eventsv1beta1", 24 | "k8s.io/api/extensions/v1beta1": "extensionsv1beta1", 25 | "k8s.io/api/imagepolicy/v1alpha1": "imagepolicyv1alpha1", 26 | "k8s.io/api/networking/v1": "networkingv1", 27 | "k8s.io/api/networking/v1beta1": "networkingv1beta1", 28 | "k8s.io/api/node/v1alpha1": "nodev1alpha1", 29 | "k8s.io/api/node/v1beta1": "nodev1beta1", 30 | "k8s.io/api/node/v1": "nodev1", 31 | "k8s.io/api/policy/v1": "policyv1", 32 | "k8s.io/api/policy/v1beta1": "policyv1beta1", 33 | "k8s.io/api/rbac/v1": "rbacv1", 34 | "k8s.io/api/rbac/v1alpha1": "rbacv1alpha1", 35 | "k8s.io/api/rbac/v1beta1": "rbacv1beta1", 36 | "k8s.io/api/scheduling/v1": "schedulingv1", 37 | "k8s.io/api/scheduling/v1alpha1": "schedulingv1alpha1", 38 | "k8s.io/api/scheduling/v1beta1": "schedulingv1beta1", 39 | "k8s.io/api/storage/v1": "storagev1", 40
| "k8s.io/api/storage/v1alpha1": "storagev1alpha1", 41 | "k8s.io/api/storage/v1beta1": "storagev1beta1", 42 | "k8s.io/apimachinery/pkg/api/errors": "apierrors", 43 | "k8s.io/apimachinery/pkg/apis/meta/v1": "metav1", 44 | "k8s.io/kubelet/apis/stats/v1alpha1": "kubeletstatsv1alpha1", 45 | "k8s.io/kubelet/pkg/apis/deviceplugin/v1alpha": "kubeletdevicepluginv1alpha", 46 | "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1": "kubeletdevicepluginv1beta1", 47 | "k8s.io/kubelet/pkg/apis/pluginregistration/v1": "kubeletpluginregistrationv1", 48 | "k8s.io/kubelet/pkg/apis/pluginregistration/v1alpha1": "kubeletpluginregistrationv1alpha1", 49 | "k8s.io/kubelet/pkg/apis/pluginregistration/v1beta1": "kubeletpluginregistrationv1beta1", 50 | "k8s.io/kubelet/pkg/apis/podresources/v1alpha1": "kubeletpodresourcesv1alpha1" 51 | } 52 | -------------------------------------------------------------------------------- /hack/boilerplate/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | -------------------------------------------------------------------------------- /hack/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright © 2024 HAMi Authors 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | set -e 18 | [[ -z ${SHORT_VERSION} ]] && SHORT_VERSION=$(git rev-parse --abbrev-ref HEAD) 19 | [[ -z ${COMMIT_CODE} ]] && COMMIT_CODE=$(git describe --abbrev=100 --always) 20 | 21 | export SHORT_VERSION 22 | export COMMIT_CODE 23 | export VERSION="${SHORT_VERSION}-${COMMIT_CODE}" 24 | export LATEST_VERSION="latest" 25 | export GOLANG_IMAGE="golang:1.22.5-bullseye" 26 | export NVIDIA_IMAGE="nvidia/cuda:12.2.0-devel-ubuntu20.04" 27 | export DEST_DIR="/usr/local" 28 | 29 | IMAGE=${IMAGE-"projecthami/hami"} 30 | 31 | function go_build() { 32 | [[ -z "$J" ]] && J=$(nproc | awk '{print int(($0 + 1)/ 2)}') 33 | make -j$J 34 | } 35 | 36 | function docker_build() { 37 | docker build --build-arg VERSION="${VERSION}" --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} --build-arg DEST_DIR=${DEST_DIR} -t "${IMAGE}:${VERSION}" -f docker/Dockerfile . 
38 | docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${SHORT_VERSION}" 39 | docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${LATEST_VERSION}" 40 | } 41 | 42 | function docker_push() { 43 | #docker push "${IMAGE}:${VERSION}" 44 | docker push "${IMAGE}:${SHORT_VERSION}" 45 | docker push "${IMAGE}:${LATEST_VERSION}" 46 | } 47 | 48 | go_build 49 | docker_build 50 | docker_push 51 | -------------------------------------------------------------------------------- /hack/e2e-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o errexit 17 | set -o nounset 18 | set -o pipefail 19 | 20 | set -x 21 | 22 | E2E_TYPE=${1:-"pullrequest"} 23 | KUBE_CONF=${2:-""} 24 | 25 | REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 26 | source "${REPO_ROOT}"/hack/util.sh 27 | 28 | if util::cmd_exist ginkgo; then 29 | echo "Using ginkgo version:" 30 | ginkgo version 31 | else 32 | go install github.com/onsi/ginkgo/v2/ginkgo 33 | go get github.com/onsi/gomega/... 34 | ginkgo version 35 | fi 36 | 37 | 38 | if [ -z "${KUBE_CONF}" ]; then 39 | echo "Error: KUBE_CONF is not set; pass the kubeconfig path as the second argument." 40 | exit 1 41 | fi 42 | 43 | # Run e2e 44 | if [ "${E2E_TYPE}" == "pullrequest" ] || [ "${E2E_TYPE}" == "release" ]; then 45 | ginkgo -v -r --fail-fast ./test/e2e/ --kubeconfig="${KUBE_CONF}" 46 | if [ $? -ne 0 ]; then 47 | echo "Error: ginkgo command failed." 48 | exit 1 49 | fi 50 | else 51 | echo "Invalid E2E Type: ${E2E_TYPE}" 52 | exit 1 53 | fi 54 | -------------------------------------------------------------------------------- /hack/kubeconfig-demo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | clusters: 3 | - cluster: 4 | server: http://localhost:8080 5 | name: local-server 6 | contexts: 7 | - context: 8 | cluster: local-server 9 | namespace: the-right-prefix 10 | user: myself 11 | name: default-context 12 | current-context: default-context 13 | kind: Config 14 | preferences: {} 15 | users: 16 | - name: myself 17 | user: 18 | password: secret 19 | username: admin 20 | -------------------------------------------------------------------------------- /hack/tools/tools.go: -------------------------------------------------------------------------------- 1 | //go:build tools 2 | 3 | /* 4 | Copyright 2024 The HAMi Authors. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package tools 20 | 21 | import ( 22 | _ "golang.org/x/tools/cmd/goimports" 23 | ) 24 | -------------------------------------------------------------------------------- /hack/unit-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o errexit 17 | set -o nounset 18 | set -o pipefail 19 | 20 | set -x 21 | 22 | # init kubeconfig env 23 | kubeconfig_path="${HOME}/.kube" 24 | kubeconfig_file="${kubeconfig_path}/config" 25 | kubeconfig_demo="./hack/kubeconfig-demo.yaml" 26 | 27 | echo "kubeconfig: ${kubeconfig_file}" 28 | 29 | if [ ! -f "$kubeconfig_file" ]; then 30 | echo "Generate fake kubeconfig" 31 | if [ ! -d "${kubeconfig_path}" ]; then 32 | trap 'rm -rf "$kubeconfig_path"' EXIT 33 | mkdir -p "${kubeconfig_path}" 34 | cp ${kubeconfig_demo} "${kubeconfig_file}" 35 | else 36 | trap 'rm -f "$kubeconfig_file"' EXIT 37 | cp ${kubeconfig_demo} "${kubeconfig_file}" 38 | fi 39 | else 40 | echo "Use local kubeconfig" 41 | fi 42 | 43 | mkdir -p ./_output/coverage/ 44 | mergeF="./_output/coverage/merge.out" 45 | rm -f ${mergeF} 46 | cov_file="./_output/coverage/coverage_pkg.txt" 47 | go test $(go list ./pkg/... | grep -v ./pkg/device-plugin/...) -short --race -count=1 -covermode=atomic -coverprofile=${cov_file} 48 | cat $cov_file | grep -v mode: | grep -v pkg/version | grep -v fake | grep -v main.go >>${mergeF} 49 | # merge the filtered profiles into a single coverage.out for go tool cover 50 | echo "mode: atomic" >coverage.out 51 | cat ${mergeF} >>coverage.out 52 | go tool cover -func=coverage.out 53 | -------------------------------------------------------------------------------- /hack/update-generated-api.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright © 2024 HAMi Authors 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | ROOT_DIR=$(dirname "${BASH_SOURCE[0]}")/..
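# Regenerate the Go gRPC bindings from the .proto files under pkg/api; this assumes protoc and the gofast plugin (protoc-gen-gofast) are installed and on PATH.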
19 | protoc -I${ROOT_DIR} --gofast_out=plugins=grpc:${ROOT_DIR} ${ROOT_DIR}/pkg/api/*.proto -------------------------------------------------------------------------------- /hack/verify-all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o errexit 17 | set -o nounset 18 | set -o pipefail 19 | 20 | REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 21 | 22 | # Show progress 23 | set -x 24 | 25 | # Orders are determined by two factors: 26 | # (1) Items with shorter execution time should run first. 27 | # (2) Items more likely to fail should run first. 28 | 29 | bash "$REPO_ROOT/hack/verify-staticcheck.sh" 30 | 31 | bash "$REPO_ROOT/hack/verify-license.sh" 32 | 33 | bash "$REPO_ROOT/hack/verify-import-aliases.sh" 34 | -------------------------------------------------------------------------------- /hack/verify-chart-version.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o errexit 17 | set -o nounset 18 | set -o pipefail 19 | 20 | REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 21 | cd "${REPO_ROOT}" 22 | 23 | source "${REPO_ROOT}"/hack/util.sh 24 | 25 | # install helm 26 | echo -n "Preparing: 'helm' existence check - " 27 | if util::cmd_exist helm; then 28 | echo "passed" 29 | else 30 | echo "installing helm" 31 | util::install_helm 32 | fi 33 | 34 | APP_VERSION=$(helm show chart ./charts/hami | grep '^appVersion' |grep -E '[0-9].*.[0-9]' | awk -F ':' '{print $2}' | tr -d ' ') 35 | VERSION=$(helm show chart ./charts/hami | grep '^version' |grep -E '[0-9].*.[0-9]' | awk -F ':' '{print $2}' | tr -d ' ') 36 | 37 | if [[ ${APP_VERSION} != ${VERSION} ]]; then 38 | echo "AppVersion of HAMi is ${APP_VERSION}, but version is ${VERSION}!" 39 | exit 1 40 | fi 41 | 42 | echo "Both appVersion and version are ${APP_VERSION}." 43 | 44 | -------------------------------------------------------------------------------- /hack/verify-import-aliases.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors.
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o errexit 17 | set -o nounset 18 | set -o pipefail 19 | 20 | SCRIPT_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 21 | cd "${SCRIPT_ROOT}" 22 | ROOT_PATH=$(pwd) 23 | 24 | IMPORT_ALIASES_PATH="${ROOT_PATH}/hack/.import-aliases" 25 | INCLUDE_PATH="(${ROOT_PATH}/cmd|${ROOT_PATH}/pkg)" 26 | 27 | ret=0 28 | # We can't directly install preferredimports by `go install` due to the go.mod issue: 29 | # go install k8s.io/kubernetes/cmd/preferredimports@v1.21.3: k8s.io/kubernetes@v1.21.3 30 | # The go.mod file for the module providing named packages contains one or 31 | # more replace directives. It must not contain directives that would cause 32 | # it to be interpreted differently than if it were the main module. 33 | go run "${ROOT_PATH}/hack/tools/preferredimports/preferredimports.go" -import-aliases "${IMPORT_ALIASES_PATH}" -include-path "${INCLUDE_PATH}" "${ROOT_PATH}" || ret=$? 34 | if [[ $ret -ne 0 ]]; then 35 | echo "!!! Please see hack/.import-aliases for the preferred aliases for imports." >&2 36 | exit 1 37 | fi 38 | echo "Passed import-aliases verification." 39 | -------------------------------------------------------------------------------- /hack/verify-license.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o errexit 17 | set -o nounset 18 | set -o pipefail 19 | set -ex 20 | 21 | REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 
22 | cd "${REPO_ROOT}" 23 | 24 | if [[ "$(which addlicense)" == "" ]]; then 25 | go install github.com/google/addlicense@v1.1.1 26 | fi 27 | ADDLICENSE_BIN=$(which addlicense) 28 | 29 | # verify presence of license headers and exit with non-zero code if missing 30 | missing_license_header_files="$($ADDLICENSE_BIN \ 31 | -check \ 32 | -ignore "benchmarks/**" \ 33 | -ignore "charts/**" \ 34 | -ignore "docs/**" \ 35 | -ignore "docker/**" \ 36 | -ignore "examples/**" \ 37 | -ignore "lib/**" \ 38 | -ignore "libvgpu/**" \ 39 | -ignore "third_party/**" \ 40 | -ignore "vendor/**" \ 41 | -ignore "_output/**" \ 42 | -ignore ".github/**" \ 43 | -ignore "**/*.md" \ 44 | -ignore "**/*.yaml" \ 45 | -ignore "**/*.yml" \ 46 | -ignore "**/*.json" \ 47 | -ignore ".idea/**" \ 48 | .)" || true 49 | 50 | if [[ "$missing_license_header_files" ]]; then 51 | echo "Files with no license header detected:" 52 | echo "$missing_license_header_files" 53 | echo "Please add all missing license headers." 54 | exit 1 55 | fi 56 | 57 | echo "Congratulations! All files have passed the license header check." 58 | -------------------------------------------------------------------------------- /hack/verify-staticcheck.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2024 The HAMi Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o errexit 17 | set -o nounset 18 | set -o pipefail 19 | 20 | REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 21 | GOLANGCI_LINT_VER="v2.1.1" 22 | 23 | cd "${REPO_ROOT}" 24 | source "hack/util.sh" 25 | 26 | if util::cmd_exist golangci-lint; then 27 | echo "Using golangci-lint version:" 28 | golangci-lint version 29 | else 30 | echo "Installing golangci-lint ${GOLANGCI_LINT_VER}" 31 | # https://golangci-lint.run/usage/install/#other-ci 32 | curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/HEAD/install.sh | sh -s -- -b $(go env GOPATH)/bin ${GOLANGCI_LINT_VER} 33 | fi 34 | 35 | if golangci-lint run; then 36 | echo 'Congratulations! All Go source files have passed staticcheck.' 37 | else 38 | echo # print one empty line, separate from warning messages. 39 | echo 'Please review the above warnings.' 40 | echo 'If the above warnings do not make sense, feel free to file an issue.'
41 | exit 1 42 | fi 43 | -------------------------------------------------------------------------------- /imgs/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/arch.png -------------------------------------------------------------------------------- /imgs/benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/benchmark.png -------------------------------------------------------------------------------- /imgs/benchmark_inf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/benchmark_inf.png -------------------------------------------------------------------------------- /imgs/benchmark_train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/benchmark_train.png -------------------------------------------------------------------------------- /imgs/cncf-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/cncf-logo.png -------------------------------------------------------------------------------- /imgs/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/example.png -------------------------------------------------------------------------------- /imgs/hami-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/hami-arch.png -------------------------------------------------------------------------------- /imgs/hami-arch.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/hami-arch.pptx -------------------------------------------------------------------------------- /imgs/hami-graph-color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/hami-graph-color.png -------------------------------------------------------------------------------- /imgs/hami-horizontal-colordark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/hami-horizontal-colordark.png -------------------------------------------------------------------------------- /imgs/hami-vgpu-metrics-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/hami-vgpu-metrics-dashboard.png -------------------------------------------------------------------------------- /imgs/hard_limit.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/hard_limit.jpg -------------------------------------------------------------------------------- /imgs/metax_binpack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/metax_binpack.png -------------------------------------------------------------------------------- /imgs/metax_spread.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/metax_spread.png -------------------------------------------------------------------------------- /imgs/metax_topo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/metax_topo.png -------------------------------------------------------------------------------- /imgs/release-process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Project-HAMi/HAMi/60fbb312516058fdc8c499a1f4d31244193b8d03/imgs/release-process.png -------------------------------------------------------------------------------- /lib/nvidia/ld.so.preload: -------------------------------------------------------------------------------- 1 | /usr/local/vgpu/libvgpu.so -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/cdi/api.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package cdi 34 | 35 | // Interface provides the API to the 'cdi' package 36 | // 37 | //go:generate moq -stub -out api_mock.go . 
Interface 38 | type Interface interface { 39 | CreateSpecFile() error 40 | QualifiedName(string, string) string 41 | } 42 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/cdi/factory.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package cdi 34 | 35 | import ( 36 | "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" 37 | 38 | "k8s.io/klog/v2" 39 | ) 40 | 41 | // New is a factory method that creates a CDI handler for creating CDI specs. 42 | func New(opts ...Option) (Interface, error) { 43 | infolib := info.New() 44 | 45 | hasNVML, _ := infolib.HasNvml() 46 | if !hasNVML { 47 | klog.Warning("No valid resources detected, creating a null CDI handler") 48 | return NewNullHandler(), nil 49 | } 50 | 51 | return newHandler(opts...) 52 | } 53 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/cdi/null.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 
31 | */ 32 | 33 | package cdi 34 | 35 | import ( 36 | "k8s.io/klog/v2" 37 | ) 38 | 39 | type null struct{} 40 | 41 | var _ Interface = &null{} 42 | 43 | // NewNullHandler returns an instance of the 'cdi' interface that can 44 | // be used when CDI specs are not required. 45 | func NewNullHandler() Interface { 46 | return &null{} 47 | } 48 | 49 | // CreateSpecFile is a no-op for the null handler. 50 | func (n *null) CreateSpecFile() error { 51 | return nil 52 | } 53 | 54 | // QualifiedName is a no-op for the null handler. An error message is logged 55 | // indicating that it should never be called for the null handler. 56 | func (n *null) QualifiedName(class string, id string) string { 57 | klog.Error("cannot return a qualified CDI device name with the null CDI handler") 58 | return "" 59 | } 60 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/info/version.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package info 34 | 35 | import "strings" 36 | 37 | // version must be set by go build's -X main.version= option in the Makefile. 38 | var version = "unknown" 39 | 40 | // gitCommit will be the hash that the binary was built from 41 | // and will be populated by the Makefile. 42 | var gitCommit = "" 43 | 44 | // GetVersionParts returns the different version components. 45 | func GetVersionParts() []string { 46 | v := []string{version} 47 | 48 | if gitCommit != "" { 49 | v = append(v, "commit: "+gitCommit) 50 | } 51 | 52 | return v 53 | } 54 | 55 | // GetVersionString returns the string representation of the version. 56 | func GetVersionString(more ...string) string { 57 | v := append(GetVersionParts(), more...) 58 | return strings.Join(v, "\n") 59 | } 60 | 61 | // GetVersion returns the version of the binary. 62 | func GetVersion() string { 63 | return version 64 | } 65 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/api.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license.
7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package plugin 34 | 35 | import "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm" 36 | 37 | // Interface defines the API for the plugin package 38 | type Interface interface { 39 | Devices() rm.Devices 40 | Start() error 41 | Stop() error 42 | } 43 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/api.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package manager 34 | 35 | import "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 36 | 37 | // Interface defines the API for the plugin manager package 38 | type Interface interface { 39 | GetPlugins() ([]plugin.Interface, error) 40 | CreateCDISpecFile() error 41 | } 42 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/null.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 
7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package manager 34 | 35 | import ( 36 | "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 37 | ) 38 | 39 | type null struct{} 40 | 41 | // GetPlugins returns an empty set of plugins for the null manager 42 | func (m *null) GetPlugins() ([]plugin.Interface, error) { 43 | return nil, nil 44 | } 45 | 46 | // CreateCDISpecFile is a no-op for the null plugin manager 47 | func (m *null) CreateCDISpecFile() error { 48 | return nil 49 | } 50 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/nvml.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details.
31 | */ 32 | 33 | package manager 34 | 35 | import ( 36 | "fmt" 37 | 38 | "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 39 | "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm" 40 | ) 41 | 42 | type nvmlmanager manager 43 | 44 | // GetPlugins returns the plugins associated with the NVML resources available on the node 45 | func (m *nvmlmanager) GetPlugins() ([]plugin.Interface, error) { 46 | sConfig, mode, err := plugin.LoadNvidiaDevicePluginConfig() 47 | if err != nil { 48 | return nil, fmt.Errorf("failed to load nvidia plugin config: %v", err) 49 | } 50 | 51 | rms, err := rm.NewNVMLResourceManagers(m.nvmllib, m.config) 52 | if err != nil { 53 | return nil, fmt.Errorf("failed to construct NVML resource managers: %v", err) 54 | } 55 | 56 | var plugins []plugin.Interface 57 | for _, r := range rms { 58 | plugins = append(plugins, plugin.NewNvidiaDevicePlugin(m.config, r, m.cdiHandler, m.cdiEnabled, sConfig, mode)) 59 | } 60 | return plugins, nil 61 | } 62 | 63 | // CreateCDISpecFile forwards the request to the CDI handler 64 | func (m *nvmlmanager) CreateCDISpecFile() error { 65 | return m.cdiHandler.CreateSpecFile() 66 | } 67 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/options.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details.
31 | */ 32 | 33 | package manager 34 | 35 | import ( 36 | "github.com/NVIDIA/go-nvlib/pkg/nvml" 37 | 38 | "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/cdi" 39 | "github.com/Project-HAMi/HAMi/pkg/device/nvidia" 40 | ) 41 | 42 | // Option is a function that configures a manager 43 | type Option func(*manager) 44 | 45 | // WithCDIEnabled sets whether CDI is enabled for the manager 46 | func WithCDIEnabled(enabled bool) Option { 47 | return func(m *manager) { 48 | m.cdiEnabled = enabled 49 | } 50 | } 51 | 52 | // WithCDIHandler sets the CDI handler for the manager 53 | func WithCDIHandler(handler cdi.Interface) Option { 54 | return func(m *manager) { 55 | m.cdiHandler = handler 56 | } 57 | } 58 | 59 | // WithNVML sets the NVML handler for the manager 60 | func WithNVML(nvmllib nvml.Interface) Option { 61 | return func(m *manager) { 62 | m.nvmllib = nvmllib 63 | } 64 | } 65 | 66 | // WithFailOnInitError sets whether the manager should fail on initialization errors 67 | func WithFailOnInitError(failOnInitError bool) Option { 68 | return func(m *manager) { 69 | m.failOnInitError = failOnInitError 70 | } 71 | } 72 | 73 | // WithMigStrategy sets the MIG strategy for the manager 74 | func WithMigStrategy(migStrategy string) Option { 75 | return func(m *manager) { 76 | m.migStrategy = migStrategy 77 | } 78 | } 79 | 80 | // WithConfig sets the config reference for the manager 81 | func WithConfig(config *nvidia.DeviceConfig) Option { 82 | return func(m *manager) { 83 | m.config = config 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/tegra.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 
31 | */ 32 | 33 | package manager 34 | 35 | import ( 36 | "fmt" 37 | 38 | "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" 39 | "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm" 40 | ) 41 | 42 | type tegramanager manager 43 | 44 | // GetPlugins returns the plugins associated with the Tegra resources available on the node 45 | func (m *tegramanager) GetPlugins() ([]plugin.Interface, error) { 46 | sConfig, mode, err := plugin.LoadNvidiaDevicePluginConfig() 47 | if err != nil { 48 | return nil, fmt.Errorf("failed to load nvidia plugin config: %v", err) 49 | } 50 | 51 | rms, err := rm.NewTegraResourceManagers(m.config) 52 | if err != nil { 53 | return nil, fmt.Errorf("failed to construct Tegra resource managers: %v", err) 54 | } 55 | 56 | var plugins []plugin.Interface 57 | for _, r := range rms { 58 | plugins = append(plugins, plugin.NewNvidiaDevicePlugin(m.config, r, m.cdiHandler, m.cdiEnabled, sConfig, mode)) 59 | } 60 | return plugins, nil 61 | } 62 | 63 | // CreateCDISpecFile is a no-op for the tegra plugin manager 64 | func (m *tegramanager) CreateCDISpecFile() error { 65 | return nil 66 | } 67 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/plugin/register_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package plugin 34 | 35 | import "testing" 36 | 37 | func Test_parseNvidiaNumaInfo(t *testing.T) { 38 | 39 | tests := []struct { 40 | name string 41 | idx int 42 | nvidiaTopoStr string 43 | want int 44 | wantErr bool 45 | }{ 46 | { 47 | name: "single Tesla P4 NUMA", 48 | idx: 0, 49 | nvidiaTopoStr: `GPU0 CPU Affinity NUMA Affinity ... 50 | ...`, 51 | want: 0, 52 | wantErr: false, 53 | }, 54 | { 55 | name: "two Tesla P4 NUMA topo with index 0", 56 | idx: 0, 57 | nvidiaTopoStr: `GPU0 GPU1 CPU Affinity NUMA Affinity ... 58 | ...`, 59 | want: 0, 60 | wantErr: false, 61 | }, 62 | { 63 | name: "two Tesla P4 NUMA topo with index 1", 64 | idx: 1, 65 | nvidiaTopoStr: `GPU0 GPU1 CPU Affinity NUMA Affinity ...
66 | ...`, 67 | want: 0, 68 | wantErr: false, 69 | }, 70 | { 71 | name: "NUMA Affinity is empty", 72 | idx: 0, 73 | nvidiaTopoStr: `GPU0 CPU Affinity NUMA Affinity GPU NUMA ID 74 | GPU0 X`, 75 | want: 0, 76 | wantErr: false, 77 | }, 78 | } 79 | 80 | for _, tt := range tests { 81 | t.Run(tt.name, func(t *testing.T) { 82 | got, err := parseNvidiaNumaInfo(tt.idx, tt.nvidiaTopoStr) 83 | if (err != nil) != tt.wantErr { 84 | t.Errorf("parseNvidiaNumaInfo() error = %v, wantErr %v", err, tt.wantErr) 85 | return 86 | } 87 | if got != tt.want { 88 | t.Errorf("parseNvidiaNumaInfo() got = %v, want %v", got, tt.want) 89 | } 90 | }) 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/helper.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package rm 34 | 35 | // int8Slice wraps an []int8 with more functions. 36 | type int8Slice []int8 37 | 38 | // String turns a null-terminated int8Slice into a string 39 | func (s int8Slice) String() string { 40 | var b []byte 41 | for _, c := range s { 42 | if c == 0 { 43 | break 44 | } 45 | b = append(b, byte(c)) 46 | } 47 | return string(b) 48 | } 49 | 50 | // uintPtr returns a *uint from a uint32 51 | func uintPtr(c uint32) *uint { 52 | i := uint(c) 53 | return &i 54 | } 55 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/tegra_devices.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License.
16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details. 31 | */ 32 | 33 | package rm 34 | 35 | import ( 36 | "fmt" 37 | 38 | "github.com/Project-HAMi/HAMi/pkg/device/nvidia" 39 | ) 40 | 41 | const ( 42 | tegraDeviceName = "tegra" 43 | ) 44 | 45 | // buildTegraDeviceMap creates a DeviceMap for the tegra devices in the system. 46 | // NOTE: At present only a single tegra device is expected. 47 | func buildTegraDeviceMap(config *nvidia.DeviceConfig) (DeviceMap, error) { 48 | devices := make(DeviceMap) 49 | 50 | name := tegraDeviceName 51 | i := 0 52 | for _, resource := range config.Resources.GPUs { 53 | if resource.Pattern.Matches(name) { 54 | index := fmt.Sprintf("%d", i) 55 | err := devices.setEntry(resource.Name, index, &tegraDevice{}) 56 | if err != nil { 57 | return nil, err 58 | } 59 | i++ 60 | } 61 | 62 | } 63 | return devices, nil 64 | } 65 | 66 | type tegraDevice struct{} 67 | 68 | var _ deviceInfo = (*tegraDevice)(nil) 69 | 70 | // GetUUID returns the UUID of the tegra device. 71 | // TODO: This is currently hardcoded to `tegra` 72 | func (d *tegraDevice) GetUUID() (string, error) { 73 | return tegraDeviceName, nil 74 | } 75 | 76 | // GetPaths returns the paths for a tegra device. 77 | // A tegra device does not have paths associated with it. 78 | func (d *tegraDevice) GetPaths() ([]string, error) { 79 | return nil, nil 80 | } 81 | 82 | // GetNumaNode always returns unsupported for a Tegra device 83 | func (d *tegraDevice) GetNumaNode() (bool, int, error) { 84 | return false, -1, nil 85 | } 86 | -------------------------------------------------------------------------------- /pkg/device-plugin/nvidiadevice/nvinternal/rm/wsl_devices.go: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * 4 | * The HAMi Contributors require contributions made to 5 | * this file be licensed under the Apache-2.0 license or a 6 | * compatible open source license. 7 | */ 8 | 9 | /* 10 | * Licensed to NVIDIA CORPORATION under one or more contributor 11 | * license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright 13 | * ownership. NVIDIA CORPORATION licenses this file to you under 14 | * the Apache License, Version 2.0 (the "License"); you may 15 | * not use this file except in compliance with the License. 16 | * You may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, 21 | * software distributed under the License is distributed on an 22 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 23 | * KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations 25 | * under the License. 26 | */ 27 | 28 | /* 29 | * Modifications Copyright The HAMi Authors. See 30 | * GitHub history for details.
31 | */ 32 | 33 | package rm 34 | 35 | type wslDevice nvmlDevice 36 | 37 | var _ deviceInfo = (*wslDevice)(nil) 38 | 39 | // GetUUID returns the UUID of the device 40 | func (d wslDevice) GetUUID() (string, error) { 41 | return nvmlDevice(d).GetUUID() 42 | } 43 | 44 | // GetPaths returns the paths for a WSL device. 45 | func (d wslDevice) GetPaths() ([]string, error) { 46 | return []string{"/dev/dxg"}, nil 47 | } 48 | 49 | // GetNumaNode returns the NUMA node associated with the GPU device 50 | func (d wslDevice) GetNumaNode() (bool, int, error) { 51 | return nvmlDevice(d).GetNumaNode() 52 | } 53 | -------------------------------------------------------------------------------- /pkg/device/ascend/vnpu.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package ascend 18 | 19 | type Template struct { 20 | Name string `yaml:"name"` 21 | Memory int64 `yaml:"memory"` 22 | AICore int32 `yaml:"aiCore,omitempty"` 23 | AICPU int32 `yaml:"aiCPU,omitempty"` 24 | } 25 | 26 | type VNPUConfig struct { 27 | CommonWord string `yaml:"commonWord"` 28 | ChipName string `yaml:"chipName"` 29 | ResourceName string `yaml:"resourceName"` 30 | ResourceMemoryName string `yaml:"resourceMemoryName"` 31 | MemoryAllocatable int64 `yaml:"memoryAllocatable"` 32 | MemoryCapacity int64 `yaml:"memoryCapacity"` 33 | AICore int32 `yaml:"aiCore"` 34 | AICPU int32 `yaml:"aiCPU"` 35 | Templates []Template `yaml:"templates"` 36 | } 37 | -------------------------------------------------------------------------------- /pkg/device/metax/config.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package metax 18 | 19 | import "flag" 20 | 21 | type MetaxConfig struct { 22 | // GPU 23 | ResourceCountName string `yaml:"resourceCountName"` 24 | 25 | // SGPU 26 | ResourceVCountName string `yaml:"resourceVCountName"` 27 | ResourceVMemoryName string `yaml:"resourceVMemoryName"` 28 | ResourceVCoreName string `yaml:"resourceVCoreName"` 29 | } 30 | 31 | func ParseConfig(fs *flag.FlagSet) { 32 | // GPU 33 | fs.StringVar(&MetaxResourceCount, "metax-name", "metax-tech.com/gpu", "metax resource count") 34 | 35 | // SGPU 36 | fs.StringVar(&MetaxResourceNameVCount, "metax-vcount", "metax-tech.com/sgpu", "metax vcount name") 37 | fs.StringVar(&MetaxResourceNameVCore, "metax-vcore", "metax-tech.com/vcore", "metax vcore name") 38 | fs.StringVar(&MetaxResourceNameVMemory, "metax-vmemory", "metax-tech.com/vmemory", "metax vmemory name") 39 | } 40 | -------------------------------------------------------------------------------- /pkg/k8sutil/pod.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package k8sutil 18 | 19 | import ( 20 | "github.com/Project-HAMi/HAMi/pkg/device" 21 | "github.com/Project-HAMi/HAMi/pkg/util" 22 | 23 | corev1 "k8s.io/api/core/v1" 24 | "k8s.io/klog/v2" 25 | ) 26 | 27 | func Resourcereqs(pod *corev1.Pod) (counts util.PodDeviceRequests) { 28 | counts = make(util.PodDeviceRequests, len(pod.Spec.Containers)) 29 | klog.V(4).InfoS("Processing resource requirements", 30 | "pod", klog.KObj(pod), 31 | "containerCount", len(pod.Spec.Containers)) 32 | // Count device requests across all containers and device types 33 | cnt := int32(0) 34 | for i := range pod.Spec.Containers { 35 | devices := device.GetDevices() 36 | counts[i] = make(util.ContainerDeviceRequests) 37 | klog.V(5).InfoS("Processing container resources", 38 | "pod", klog.KObj(pod), 39 | "containerIndex", i, 40 | "containerName", pod.Spec.Containers[i].Name) 41 | for idx, val := range devices { 42 | request := val.GenerateResourceRequests(&pod.Spec.Containers[i]) 43 | if request.Nums > 0 { 44 | cnt += request.Nums 45 | counts[i][idx] = request // reuse the request computed above rather than regenerating it 46 | } 47 | } 48 | } 49 | if cnt == 0 { 50 | klog.V(4).InfoS("No device requests found", "pod", klog.KObj(pod)) 51 | } else { 52 | klog.V(4).InfoS("Resource requirements collected", "pod", klog.KObj(pod), "requests", counts) 53 | } 54 | return counts 55 | } 56 | 57 | func IsPodInTerminatedState(pod *corev1.Pod) bool { 58 | return pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded 59 | } 60 | 61 | func AllContainersCreated(pod *corev1.Pod) bool { 62 | return len(pod.Status.ContainerStatuses) >= len(pod.Spec.Containers) 63 | } 64 | -------------------------------------------------------------------------------- /pkg/oci/runtime.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | // Runtime is an interface for a runtime shim. The Exec method accepts a list 20 | // of command line arguments, and returns an error / nil. 21 | type Runtime interface { 22 | Exec([]string) error 23 | } 24 | -------------------------------------------------------------------------------- /pkg/oci/runtime_exec.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | "syscall" 23 | 24 | log "github.com/sirupsen/logrus" 25 | ) 26 | 27 | // SyscallExecRuntime wraps the path to a binary and defines the semantics for how to exec into it. 28 | // This can be used to wrap an OCI-compliant low-level runtime binary, allowing it to be used through the 29 | // Runtime interface. 30 | type SyscallExecRuntime struct { 31 | logger *log.Logger 32 | path string 33 | // exec is used for testing. This defaults to syscall.Exec 34 | exec func(argv0 string, argv []string, envv []string) error 35 | } 36 | 37 | var _ Runtime = (*SyscallExecRuntime)(nil) 38 | 39 | // NewSyscallExecRuntime creates a SyscallExecRuntime for the specified path with the standard logger. 40 | func NewSyscallExecRuntime(path string) (Runtime, error) { 41 | return NewSyscallExecRuntimeWithLogger(log.StandardLogger(), path) 42 | } 43 | 44 | // NewSyscallExecRuntimeWithLogger creates a SyscallExecRuntime for the specified logger and path. 45 | func NewSyscallExecRuntimeWithLogger(logger *log.Logger, path string) (Runtime, error) { 46 | info, err := os.Stat(path) 47 | if err != nil { 48 | return nil, fmt.Errorf("invalid path '%v': %v", path, err) 49 | } 50 | if info.IsDir() || info.Mode()&0111 == 0 { 51 | return nil, fmt.Errorf("specified path '%v' is not an executable file", path) 52 | } 53 | 54 | shim := SyscallExecRuntime{ 55 | logger: logger, 56 | path: path, 57 | exec: syscall.Exec, 58 | } 59 | 60 | return &shim, nil 61 | } 62 | 63 | // Exec execs into the binary at the path from the SyscallExecRuntime struct, passing it the supplied arguments 64 | // after ensuring that the first argument is the path of the target binary.
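//
// A minimal usage sketch (editor's illustration; the runtime path below is a
// hypothetical example, not something this package mandates):
//
//	rt, err := NewSyscallExecRuntime("/usr/bin/runc")
//	if err != nil {
//		log.Fatal(err)
//	}
//	// Passing os.Args execs the wrapped binary in place of the current
//	// process; Exec substitutes its own path for args[0].
//	if err := rt.Exec(os.Args); err != nil {
//		log.Fatal(err)
//	}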
65 | func (s SyscallExecRuntime) Exec(args []string) error { 66 | runtimeArgs := []string{s.path} 67 | if len(args) > 1 { 68 | runtimeArgs = append(runtimeArgs, args[1:]...) 69 | } 70 | 71 | err := s.exec(s.path, runtimeArgs, os.Environ()) 72 | if err != nil { 73 | return fmt.Errorf("could not exec '%v': %v", s.path, err) 74 | } 75 | 76 | // syscall.Exec is not expected to return. This is an error state regardless of whether 77 | // err is nil or not. 78 | return fmt.Errorf("unexpected return from exec '%v'", s.path) 79 | } 80 | -------------------------------------------------------------------------------- /pkg/oci/runtime_mock.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | // MockExecRuntime wraps a SyscallExecRuntime, intercepting the exec call for testing. 20 | type MockExecRuntime struct { 21 | SyscallExecRuntime 22 | execMock 23 | } 24 | 25 | // WithMockExec wraps a specified SyscallExecRuntime with a mocked exec function for testing. 26 | func WithMockExec(e SyscallExecRuntime, execResult error) *MockExecRuntime { 27 | m := MockExecRuntime{ 28 | SyscallExecRuntime: e, 29 | execMock: execMock{result: execResult}, 30 | } 31 | // override the exec function with the mocked exec function. 32 | m.SyscallExecRuntime.exec = m.execMock.exec 33 | return &m 34 | } 35 | 36 | type execMock struct { 37 | argv0 string 38 | argv []string 39 | envv []string 40 | result error 41 | } 42 | 43 | func (m *execMock) exec(argv0 string, argv []string, envv []string) error { 44 | m.argv0 = argv0 45 | m.argv = argv 46 | m.envv = envv 47 | 48 | return m.result 49 | } 50 | -------------------------------------------------------------------------------- /pkg/oci/spec_mock.go: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | */ 16 | 17 | package oci 18 | 19 | import ( 20 | oci "github.com/opencontainers/runtime-spec/specs-go" 21 | ) 22 | 23 | // MockSpec provides a simple mock for an OCI spec to be used in testing. 24 | // It also implements the SpecModifier interface.
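//
// A minimal sketch of how the mock might be used in a test (editor's
// illustration; it assumes a SpecModifier that edits the wrapped spec in place):
//
//	spec := NewMockSpec(&oci.Spec{}, nil, nil)
//	err := spec.Modify(func(s *oci.Spec) { s.Version = "1.0.2" })
//	// err is the predefined modify result (nil here), and
//	// spec.MockModify.Callcount has been incremented to 1.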
25 | type MockSpec struct { 26 | *oci.Spec 27 | MockLoad mockFunc 28 | MockFlush mockFunc 29 | MockModify mockFunc 30 | } 31 | 32 | var _ Spec = (*MockSpec)(nil) 33 | 34 | // NewMockSpec constructs a MockSpec to be used in testing as a Spec. 35 | func NewMockSpec(spec *oci.Spec, flushResult error, modifyResult error) *MockSpec { 36 | s := MockSpec{ 37 | Spec: spec, 38 | MockFlush: mockFunc{result: flushResult}, 39 | MockModify: mockFunc{result: modifyResult}, 40 | } 41 | 42 | return &s 43 | } 44 | 45 | // Load invokes the mocked Load function to return the predefined error / result. 46 | func (s *MockSpec) Load() error { 47 | return s.MockLoad.call() 48 | } 49 | 50 | // Flush invokes the mocked Flush function to return the predefined error / result. 51 | func (s *MockSpec) Flush() error { 52 | return s.MockFlush.call() 53 | } 54 | 55 | // Modify applies the specified SpecModifier to the spec and invokes the 56 | // mocked modify function to return the predefined error / result. 57 | func (s *MockSpec) Modify(f SpecModifier) error { 58 | f(s.Spec) 59 | return s.MockModify.call() 60 | } 61 | 62 | type mockFunc struct { 63 | Callcount int 64 | result error 65 | } 66 | 67 | func (m *mockFunc) call() error { 68 | m.Callcount++ 69 | return m.result 70 | } 71 | -------------------------------------------------------------------------------- /pkg/scheduler/config/config.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package config 18 | 19 | import "github.com/Project-HAMi/HAMi/pkg/util" 20 | 21 | var ( 22 | QPS float32 23 | Burst int 24 | Timeout int 25 | HTTPBind string 26 | SchedulerName string 27 | MetricsBindAddress string 28 | 29 | DefaultMem int32 30 | DefaultCores int32 31 | DefaultResourceNum int32 32 | 33 | // NodeSchedulerPolicy configures the node scheduling policy, `binpack` or `spread`; the default is binpack. 34 | NodeSchedulerPolicy = util.NodeSchedulerPolicyBinpack.String() 35 | // GPUSchedulerPolicy configures the GPU scheduling policy, `binpack` or `spread`; the default is spread. 36 | GPUSchedulerPolicy = util.GPUSchedulerPolicySpread.String() 37 | 38 | // NodeLabelSelector filters candidate nodes by label during scheduling. 39 | NodeLabelSelector map[string]string 40 | ) 41 | -------------------------------------------------------------------------------- /pkg/scheduler/policy/constant.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package policy 18 | 19 | type SchedulerPolicyName string 20 | 21 | const ( 22 | // NodeSchedulerPolicyBinpack is the binpack scheduling policy for nodes. 23 | NodeSchedulerPolicyBinpack SchedulerPolicyName = "binpack" 24 | // NodeSchedulerPolicySpread is the spread scheduling policy for nodes. 25 | NodeSchedulerPolicySpread SchedulerPolicyName = "spread" 26 | // GPUSchedulerPolicyBinpack is the binpack scheduling policy for GPUs. 27 | GPUSchedulerPolicyBinpack SchedulerPolicyName = "binpack" 28 | // GPUSchedulerPolicySpread is the spread scheduling policy for GPUs. 29 | GPUSchedulerPolicySpread SchedulerPolicyName = "spread" 30 | ) 31 | 32 | func (s SchedulerPolicyName) String() string { 33 | return string(s) 34 | } 35 | 36 | const ( 37 | // NodeSchedulerPolicyAnnotationKey is the Pod annotation users set to override the default node scheduling policy. 38 | NodeSchedulerPolicyAnnotationKey = "hami.io/node-scheduler-policy" 39 | // GPUSchedulerPolicyAnnotationKey is the Pod annotation users set to override the default GPU scheduling policy. 40 | GPUSchedulerPolicyAnnotationKey = "hami.io/gpu-scheduler-policy" 41 | ) 42 | 43 | const ( 44 | Weight int = 10 // Weight scales the combined usage score produced by ComputeScore. 45 | ) 46 | -------------------------------------------------------------------------------- /pkg/scheduler/policy/gpu_policy.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package policy 18 | 19 | import ( 20 | "github.com/Project-HAMi/HAMi/pkg/util" 21 | 22 | "k8s.io/klog/v2" 23 | ) 24 | 25 | type DeviceListsScore struct { 26 | Device *util.DeviceUsage 27 | // Score records the usage/allocation score of this device 28 | Score float32 29 | } 30 | 31 | type DeviceUsageList struct { 32 | DeviceLists []*DeviceListsScore 33 | Policy string 34 | } 35 | 36 | func (l DeviceUsageList) Len() int { 37 | return len(l.DeviceLists) 38 | } 39 | 40 | func (l DeviceUsageList) Swap(i, j int) { 41 | l.DeviceLists[i], l.DeviceLists[j] = l.DeviceLists[j], l.DeviceLists[i] 42 | } 43 | 44 | func (l DeviceUsageList) Less(i, j int) bool { 45 | if l.Policy == util.GPUSchedulerPolicyBinpack.String() { 46 | if l.DeviceLists[i].Device.Numa == l.DeviceLists[j].Device.Numa { 47 | return l.DeviceLists[i].Score < l.DeviceLists[j].Score 48 | } 49 | return l.DeviceLists[i].Device.Numa > l.DeviceLists[j].Device.Numa 50 | } 51 | // default policy is spread 52 | if l.DeviceLists[i].Device.Numa == l.DeviceLists[j].Device.Numa { 53 | return l.DeviceLists[i].Score > l.DeviceLists[j].Score 54 | } 55 | return l.DeviceLists[i].Device.Numa < l.DeviceLists[j].Device.Numa 56 | } 57 | 58 | func (ds *DeviceListsScore) ComputeScore(requests util.ContainerDeviceRequests) { 59 | request, core, mem := int32(0), int32(0), int32(0) 60 | // All requests here are required to target the same device type 61 | for _, container := range requests { 62 | request += container.Nums 63 | core += container.Coresreq 64 | if container.MemPercentagereq != 0 && container.MemPercentagereq != 101 { 65 | mem += ds.Device.Totalmem * container.MemPercentagereq / 100 // multiply before dividing so integer division does not truncate the percentage to zero 66 | continue 67 | } 68 | mem += container.Memreq 69 | } 70 | klog.V(2).Infof("device %s used %d, usedCores %d, usedMem %d", ds.Device.ID, ds.Device.Used, ds.Device.Usedcores, ds.Device.Usedmem) 71 | 72 | usedScore := float32(request+ds.Device.Used) / float32(ds.Device.Count) 73 | coreScore := float32(core+ds.Device.Usedcores) / float32(ds.Device.Totalcore) 74 | memScore := float32(mem+ds.Device.Usedmem) / float32(ds.Device.Totalmem) 75 | ds.Score = float32(Weight) * (usedScore + coreScore + memScore) 76 | klog.V(2).Infof("device %s computed score is %f", ds.Device.ID, ds.Score) 77 | } 78 | -------------------------------------------------------------------------------- /pkg/util/client/options.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package client 18 | 19 | import ( 20 | "time" 21 | 22 | "k8s.io/client-go/rest" 23 | ) 24 | 25 | // Option defines a function type for client configuration options. 26 | type Option func(*rest.Config) 27 | 28 | // Now we use the default values of the kubernetes client, unless HAMi has specific requirements.
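//
// A minimal sketch of how these options compose (editor's illustration with
// hypothetical values):
//
//	cfg := &rest.Config{}
//	for _, opt := range []Option{WithQPS(50), WithBurst(100), WithDefaults()} {
//		opt(cfg)
//	}
//	// cfg.QPS is 50 and cfg.Burst is 100; WithDefaults only fills in
//	// zero values, so it does not override them.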
29 | const ( 30 | DefaultQPS float32 = rest.DefaultQPS 31 | DefaultBurst int = rest.DefaultBurst 32 | DefaultTimeout int = 0 // seconds; 0 means no timeout, following the default behavior of the kubernetes client. 33 | ) 34 | 35 | // WithQPS sets the QPS for the client. 36 | func WithQPS(qps float32) Option { 37 | return func(c *rest.Config) { 38 | c.QPS = qps 39 | } 40 | } 41 | 42 | // WithBurst sets the burst for the client. 43 | func WithBurst(burst int) Option { 44 | return func(c *rest.Config) { 45 | c.Burst = burst 46 | } 47 | } 48 | 49 | // WithTimeout sets the timeout for the client. 50 | func WithTimeout(timeout int) Option { 51 | return func(c *rest.Config) { 52 | c.Timeout = time.Duration(timeout) * time.Second 53 | } 54 | } 55 | 56 | // WithDefaults sets default values for the client configuration. 57 | func WithDefaults() Option { 58 | return func(c *rest.Config) { 59 | if c.QPS == 0 { 60 | c.QPS = DefaultQPS 61 | } 62 | if c.Burst == 0 { 63 | c.Burst = DefaultBurst 64 | } 65 | if c.Timeout == 0 { 66 | c.Timeout = time.Duration(DefaultTimeout) * time.Second 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /pkg/util/client/testdata/invalid_kubeconfig.yaml: -------------------------------------------------------------------------------- 1 | # testdata/invalid_kubeconfig.yaml 2 | apiVersion: v1 3 | kind: Config 4 | clusters: 5 | - cluster: 6 | # Missing server field or invalid URL 7 | # server: http://invalid-url/ 8 | name: broken-cluster 9 | insecure-skip-tls-verify: true 10 | users: 11 | - name: broken-user 12 | user: 13 | # Invalid or missing token 14 | token: not-a-valid-token 15 | contexts: 16 | - context: 17 | cluster: non-existent-cluster 18 | user: non-existent-user 19 | name: broken-context 20 | # Missing current-context -------------------------------------------------------------------------------- /pkg/util/client/testdata/kubeconfig.yaml: -------------------------------------------------------------------------------- 1 | # testdata/kubeconfig.yaml 2 | apiVersion: v1 3 | kind: Config 4 | clusters: 5 | - cluster: 6 | server: https://example.com 7 | insecure-skip-tls-verify: true 8 | name: example-cluster 9 | users: 10 | - name: example-user 11 | user: 12 | token: my-token-value 13 | contexts: 14 | - context: 15 | cluster: example-cluster 16 | user: example-user 17 | name: example-context 18 | current-context: example-context -------------------------------------------------------------------------------- /pkg/util/flag/flags.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package flag 18 | 19 | import ( 20 | "github.com/spf13/pflag" 21 | "github.com/urfave/cli/v2" 22 | "k8s.io/klog/v2" 23 | ) 24 | 25 | func PrintPFlags(flags *pflag.FlagSet) { 26 | flags.VisitAll(func(flag *pflag.Flag) { 27 | klog.Infof("FLAG: --%s=%q", flag.Name, flag.Value) 28 | }) 29 | } 30 | 31 | func PrintCliFlags(c *cli.Context) { 32 | for _, flag := range c.App.Flags { 33 | names := flag.Names() 34 | for _, name := range names { 35 | value := c.Generic(name) 36 | klog.Infof("FLAG: --%s=%q\n", name, value) 37 | } 38 | 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /pkg/util/flag/flags_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package flag 18 | 19 | import ( 20 | "bytes" 21 | "flag" 22 | "strings" 23 | "testing" 24 | 25 | "github.com/spf13/pflag" 26 | "github.com/urfave/cli/v2" 27 | "k8s.io/klog/v2" 28 | ) 29 | 30 | func TestPrintPFlags(t *testing.T) { 31 | var buf bytes.Buffer 32 | klog.SetOutput(&buf) 33 | klog.LogToStderr(false) 34 | defer klog.LogToStderr(true) 35 | tests := []struct { 36 | name string 37 | flags func() *pflag.FlagSet 38 | expected string 39 | }{ 40 | { 41 | name: "Test with name flags", 42 | flags: func() *pflag.FlagSet { 43 | fs := pflag.NewFlagSet("test", pflag.ContinueOnError) 44 | fs.String("name", "bob", "set name") 45 | return fs 46 | }, 47 | expected: `FLAG: --name="bob"`, 48 | }, 49 | } 50 | 51 | for _, tt := range tests { 52 | t.Run(tt.name, func(t *testing.T) { 53 | buf.Reset() 54 | PrintPFlags(tt.flags()) 55 | if got := buf.String(); !strings.Contains(got, tt.expected) { 56 | t.Errorf("PrintPFlags() = %q, want %q", got, tt.expected) 57 | } 58 | }) 59 | } 60 | } 61 | 62 | func TestPrintCliFlags(t *testing.T) { 63 | var buf bytes.Buffer 64 | klog.SetOutput(&buf) 65 | klog.LogToStderr(false) 66 | defer klog.LogToStderr(true) 67 | 68 | tests := []struct { 69 | name string 70 | cliCtx func() *cli.Context 71 | expected string 72 | }{ 73 | { 74 | name: "Test with name flag", 75 | cliCtx: func() *cli.Context { 76 | app := &cli.App{ 77 | Flags: []cli.Flag{ 78 | &cli.StringFlag{ 79 | Name: "name", 80 | Value: "bob", 81 | Usage: "set user name", 82 | }, 83 | }, 84 | } 85 | flagSet := flag.NewFlagSet("test", flag.ContinueOnError) 86 | flagSet.String("name", "bob", "") 87 | return cli.NewContext(app, flagSet, nil) 88 | }, 89 | expected: `FLAG: --name="bob" 90 | `, 91 | }, 92 | } 93 | 94 | for _, tt := range tests { 95 | t.Run(tt.name, func(t *testing.T) { 96 | buf.Reset() 97 | PrintCliFlags(tt.cliCtx()) 98 | got := buf.String() 99 | if !strings.Contains(got, tt.expected) { 100 | t.Errorf("PrintCliFlags() output = %q, want %q", got, tt.expected) 101 | } 102 | }) 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /pkg/version/version.go: 
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package version 18 | 19 | import ( 20 | "fmt" 21 | 22 | "github.com/spf13/cobra" 23 | ) 24 | 25 | var ( 26 | version string 27 | VersionCmd = &cobra.Command{ 28 | Use: "version", 29 | Short: "print version", 30 | Run: func(cmd *cobra.Command, args []string) { 31 | fmt.Println(Version()) 32 | }, 33 | } 34 | ) 35 | 36 | func Version() string { 37 | return version 38 | } 39 | -------------------------------------------------------------------------------- /pkg/version/version_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package version 18 | 19 | import ( 20 | "bytes" 21 | "io" 22 | "os" 23 | "testing" 24 | 25 | "gotest.tools/v3/assert" 26 | ) 27 | 28 | func TestVersion(t *testing.T) { 29 | version = "v1.0.0.1234567890" 30 | versionWant := "v1.0.0.1234567890\n" 31 | 32 | var out bytes.Buffer 33 | r, w, err := os.Pipe() 34 | if err != nil { 35 | t.Fatalf("os.Pipe() failed: %v", err) 36 | } 37 | defer r.Close() 38 | originalStdout := os.Stdout 39 | defer func() { 40 | os.Stdout = originalStdout 41 | w.Close() 42 | }() 43 | os.Stdout = w 44 | 45 | VersionCmd.Run(nil, nil) 46 | w.Close() 47 | 48 | io.Copy(&out, r) 49 | 50 | versionGet := out.String() 51 | assert.Equal(t, versionWant, versionGet) 52 | } 53 | -------------------------------------------------------------------------------- /test/e2e/node/test_suite_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package e2e 18 | 19 | import ( 20 | "flag" 21 | "testing" 22 | 23 | "github.com/onsi/ginkgo/v2" 24 | "github.com/onsi/gomega" 25 | ) 26 | 27 | func init() { 28 | testing.Init() 29 | flag.Parse() 30 | } 31 | 32 | func TestInit(t *testing.T) { 33 | gomega.RegisterFailHandler(ginkgo.Fail) 34 | ginkgo.RunSpecs(t, "Test node") 35 | } 36 | -------------------------------------------------------------------------------- /test/e2e/pod/test_suite_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package e2e 18 | 19 | import ( 20 | "flag" 21 | "testing" 22 | 23 | "github.com/onsi/ginkgo/v2" 24 | "github.com/onsi/gomega" 25 | ) 26 | 27 | func init() { 28 | testing.Init() 29 | flag.Parse() 30 | } 31 | 32 | func TestInit(t *testing.T) { 33 | gomega.RegisterFailHandler(ginkgo.Fail) 34 | ginkgo.RunSpecs(t, "Test pod") 35 | } 36 | -------------------------------------------------------------------------------- /test/e2e/test_suite_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package e2e 18 | 19 | import ( 20 | "flag" 21 | "testing" 22 | 23 | "github.com/onsi/ginkgo/v2" 24 | "github.com/onsi/gomega" 25 | 26 | "github.com/Project-HAMi/HAMi/test/utils" 27 | ) 28 | 29 | func init() { 30 | testing.Init() 31 | } 32 | 33 | func TestInit(t *testing.T) { 34 | flag.Parse() 35 | utils.DefaultKubeConfigPath() 36 | gomega.RegisterFailHandler(ginkgo.Fail) 37 | ginkgo.RunSpecs(t, "HAMi E2E Test Suite") 38 | } 39 | -------------------------------------------------------------------------------- /test/utils/config.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The HAMi Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
--------------------------------------------------------------------------------
/test/utils/config.go:
--------------------------------------------------------------------------------
/*
Copyright 2024 The HAMi Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package utils

// Test fixtures shared by the e2e suites.
const (
    GPUNodeLabelKey      = "gpu"
    GPUNodeLabelValue    = "on"
    GPUExecuteNvidiaSMI  = "nvidia-smi"
    GPUExecuteCudaSample = "/cuda-samples/sample"
    GPUPodMemory         = "300"
    GPUPodMemoryUnit     = "MiB"
    GPUPodCore           = "40"
    GPUNameSpace         = "hami-system"
    GPUNode              = "gpu-master"
    GPUCudaTestPass      = "Test PASSED"
)

// HAMi component names and expected scheduler events.
const (
    HamiScheduler             = "hami-scheduler"
    HamiDevicePlugin          = "hami-device-plugin"
    ErrReasonFilteringFailed  = "FilteringFailed"
    ErrMessageFilteringFailed = "no available node"
    // Reason/message emitted by the default kube-scheduler when no node fits.
    ErrReasonFailedScheduling  = "FailedScheduling"
    ErrMessageFailedScheduling = "0/1 nodes are available"
)

--------------------------------------------------------------------------------
/test/utils/event.go:
--------------------------------------------------------------------------------
/*
Copyright 2024 The HAMi Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package utils

import (
    "context"
    "fmt"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/klog/v2"
)

// GetEvents lists the events in the given namespace that match listOptions.
func GetEvents(clientSet *kubernetes.Clientset, namespace string, listOptions metav1.ListOptions) ([]v1.Event, error) {
    events, err := clientSet.CoreV1().Events(namespace).List(context.TODO(), listOptions)
    if err != nil {
        return nil, err
    }

    return events.Items, nil
}

// GetPodEvents returns all events whose involved object is the named pod.
func GetPodEvents(clientSet *kubernetes.Clientset, namespace, podName string) ([]v1.Event, error) {
    listOption := metav1.ListOptions{
        FieldSelector: fmt.Sprintf("involvedObject.kind=Pod,involvedObject.name=%s", podName),
    }

    events, err := GetEvents(clientSet, namespace, listOption)
    if err != nil {
        klog.Errorf("Failed to list events for pod %s in namespace %s: %v", podName, namespace, err)
        return nil, err
    }

    return events, nil
}
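
Putting the two files above together: a spec can fetch a pod's events with GetPodEvents and compare them against the reason/message constants. The helper below is a hypothetical composition (its name and surrounding flow are assumptions, not repository code):

package e2e

import (
    "github.com/onsi/gomega"
    "k8s.io/client-go/kubernetes"

    "github.com/Project-HAMi/HAMi/test/utils"
)

// expectFilteringFailed asserts that the named pod carries a FilteringFailed
// event with the scheduler's "no available node" message.
func expectFilteringFailed(clientSet *kubernetes.Clientset, podName string) {
    events, err := utils.GetPodEvents(clientSet, utils.GPUNameSpace, podName)
    gomega.Expect(err).NotTo(gomega.HaveOccurred())

    found := false
    for _, e := range events {
        if e.Reason == utils.ErrReasonFilteringFailed {
            gomega.Expect(e.Message).To(gomega.ContainSubstring(utils.ErrMessageFilteringFailed))
            found = true
        }
    }
    gomega.Expect(found).To(gomega.BeTrue())
}
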
--------------------------------------------------------------------------------
/test/utils/node.go:
--------------------------------------------------------------------------------
/*
Copyright 2024 The HAMi Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package utils

import (
    "context"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/klog/v2"
)

// GetNodes lists every node in the cluster.
func GetNodes(clientSet *kubernetes.Clientset) (*v1.NodeList, error) {
    nodes, err := clientSet.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
    if err != nil {
        klog.Errorf("Failed to get nodes: %v", err)
        return nil, err
    }

    return nodes, nil
}

// UpdateNode writes the given node object back to the cluster.
func UpdateNode(clientSet *kubernetes.Clientset, node *v1.Node) (*v1.Node, error) {
    updatedNode, err := clientSet.CoreV1().Nodes().Update(context.TODO(), node, metav1.UpdateOptions{})
    if err != nil {
        klog.Errorf("Failed to update node %s: %v", node.Name, err)
        return nil, err
    }

    return updatedNode, nil
}

// AddNodeLabel sets labelKey=labelValue on the named node and returns the updated node.
func AddNodeLabel(clientSet *kubernetes.Clientset, nodeName, labelKey, labelValue string) (*v1.Node, error) {
    node, err := clientSet.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
    if err != nil {
        return nil, err
    }

    if node.Labels == nil {
        node.Labels = make(map[string]string)
    }
    node.Labels[labelKey] = labelValue

    return UpdateNode(clientSet, node)
}

// RemoveNodeLabel deletes labelKey from the named node, if present.
func RemoveNodeLabel(clientSet *kubernetes.Clientset, nodeName, labelKey string) (*v1.Node, error) {
    node, err := clientSet.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
    if err != nil {
        return nil, err
    }

    if node.Labels != nil {
        delete(node.Labels, labelKey)
    }

    return UpdateNode(clientSet, node)
}

--------------------------------------------------------------------------------
/version.mk:
--------------------------------------------------------------------------------
GO=go
GO111MODULE=on
CMDS=scheduler vGPUmonitor
DEVICES=nvidia
OUTPUT_DIR=bin
TARGET_ARCH=amd64
GOLANG_IMAGE=golang:1.22.5-bullseye
NVIDIA_IMAGE=nvidia/cuda:12.3.2-devel-ubuntu20.04
DEST_DIR=/usr/local/vgpu/

VERSION = v0.0.1
IMG_NAME = hami
IMG_TAG = "${IMG_NAME}:${VERSION}"
--------------------------------------------------------------------------------
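
Finally, the node-label helpers above are what let the suites steer scheduling onto the designated GPU node. A sketch of per-spec setup and teardown using them; the clientSet initialisation is assumed to happen elsewhere in the suite and is a placeholder, not repository code:

package e2e

import (
    "github.com/onsi/ginkgo/v2"
    "github.com/onsi/gomega"
    "k8s.io/client-go/kubernetes"

    "github.com/Project-HAMi/HAMi/test/utils"
)

// clientSet is assumed to be initialised during suite setup.
var clientSet *kubernetes.Clientset

var _ = ginkgo.BeforeEach(func() {
    // Mark the GPU node (gpu=on) so GPU specs can rely on it.
    _, err := utils.AddNodeLabel(clientSet, utils.GPUNode, utils.GPUNodeLabelKey, utils.GPUNodeLabelValue)
    gomega.Expect(err).NotTo(gomega.HaveOccurred())
})

var _ = ginkgo.AfterEach(func() {
    // Remove the label so later specs start from a clean node.
    _, err := utils.RemoveNodeLabel(clientSet, utils.GPUNode, utils.GPUNodeLabelKey)
    gomega.Expect(err).NotTo(gomega.HaveOccurred())
})
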