├── .gitignore
├── .golangci.yaml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── DEV.md
├── Dockerfile.device-faker
├── Dockerfile.gaudi
├── Dockerfile.gaudi-test
├── Dockerfile.gpu
├── Dockerfile.qat
├── LICENSE
├── Makefile
├── NOTICE
├── README.md
├── SECURITY.md
├── charts
├── intel-gaudi-resource-driver
│ ├── .helmignore
│ ├── Chart.yaml
│ ├── README.md
│ ├── templates
│ │ ├── NOTES.txt
│ │ ├── _helpers.tpl
│ │ ├── clusterrole.yaml
│ │ ├── clusterrolebinding.yaml
│ │ ├── device-class.yaml
│ │ ├── nfd.yaml
│ │ ├── resource-driver-namespace.yaml
│ │ ├── resource-driver.yaml
│ │ ├── serviceaccount.yaml
│ │ ├── validating-admission-policy-binding.yaml
│ │ └── validating-admission-policy.yaml
│ └── values.yaml
├── intel-gpu-resource-driver
│ ├── .helmignore
│ ├── Chart.yaml
│ ├── README.md
│ ├── templates
│ │ ├── NOTES.txt
│ │ ├── _helpers.tpl
│ │ ├── clusterrole.yaml
│ │ ├── clusterrolebinding.yaml
│ │ ├── device-class.yaml
│ │ ├── node-feature-rules.yaml
│ │ ├── resource-driver.yaml
│ │ ├── serviceaccount.yaml
│ │ ├── validating-admission-policy-binding.yaml
│ │ └── validating-admission-policy.yaml
│ └── values.yaml
└── intel-qat-resource-driver
│ ├── Chart.yaml
│ ├── README.md
│ ├── templates
│ ├── NOTES.txt
│ ├── _helpers.tpl
│ ├── clusterrole.yaml
│ ├── clusterrolebinding.yaml
│ ├── device-class.yaml
│ ├── nfd.yaml
│ ├── resource-driver-namespace.yaml
│ ├── resource-driver.yaml
│ ├── serviceaccount.yaml
│ ├── validating-admission-policy-binding.yaml
│ └── validating-admission-policy.yaml
│ └── values.yaml
├── cmd
├── cdi-specs-generator
│ └── main.go
├── device-faker
│ └── main.go
├── kubelet-gaudi-plugin
│ ├── driver.go
│ ├── driver_test.go
│ ├── healthcare.go
│ ├── healthcare_test.go
│ ├── main.go
│ ├── node_state.go
│ └── node_state_test.go
├── kubelet-gpu-plugin
│ ├── driver.go
│ ├── driver_test.go
│ ├── main.go
│ ├── node_state.go
│ ├── node_state_test.go
│ └── test-claims
│ │ ├── empty.json
│ │ ├── invalid.json
│ │ └── multi.json
├── kubelet-qat-plugin
│ ├── clientsets.go
│ ├── config.go
│ ├── deviceresources.go
│ ├── driver.go
│ ├── driver_test.go
│ └── main.go
└── qat-showdevice
│ └── main.go
├── deployments
├── gaudi
│ ├── base
│ │ ├── device-class.yaml
│ │ ├── kustomization.yaml
│ │ ├── namespace.yaml
│ │ └── resource-driver.yaml
│ ├── examples
│ │ ├── deployment-inline.yaml
│ │ ├── monitor-pod-inline.yaml
│ │ └── pod-inline.yaml
│ ├── kustomization.yaml
│ └── overlays
│ │ ├── device-faker
│ │ ├── device-faker.yaml
│ │ ├── kustomization.yaml
│ │ └── remove-sysfs.yaml
│ │ └── nfd_labeled_nodes
│ │ ├── add-nodeselector-intel-gaudi.yaml
│ │ ├── kustomization.yaml
│ │ └── nfd-intel-gaudi-device-rule.yaml
├── gpu
│ ├── base
│ │ ├── device-class.yaml
│ │ ├── kustomization.yaml
│ │ ├── namespace.yaml
│ │ └── resource-driver.yaml
│ ├── examples
│ │ ├── claim-external-gpu.yaml
│ │ ├── deployment-inline.yaml
│ │ ├── monitor-pod-inline.yaml
│ │ ├── pod-for-claim-external-gpu.yaml
│ │ └── pod-inline-gpu.yaml
│ ├── intel-xpumanager
│ │ ├── gpu-monitor-claim.yaml
│ │ ├── kustomization.yaml
│ │ ├── xpumd-add-dra-resource.yaml
│ │ └── xpumd-delete-limits.yaml
│ ├── kustomization.yaml
│ └── overlays
│ │ ├── device-faker
│ │ ├── device-faker.yaml
│ │ ├── kustomization.yaml
│ │ └── remove-sysfs.yaml
│ │ └── nfd_labeled_nodes
│ │ ├── add-nodeselector-intel-gpu.yaml
│ │ ├── kustomization.yaml
│ │ ├── nfd-intel-gpu-device-rule.yaml
│ │ └── nfd-intel-gpu-platform-labeling.yaml
└── qat
│ ├── base
│ ├── device-class.yaml
│ ├── kustomization.yaml
│ ├── namespace.yaml
│ └── resource-driver.yaml
│ ├── examples
│ ├── deployment-inline.yaml
│ └── intel-qat-resource-driver-configuration.yaml
│ ├── kustomization.yaml
│ ├── overlays
│ └── nfd_labeled_nodes
│ │ ├── add-nodeselector-intel-qat.yaml
│ │ ├── kustomization.yaml
│ │ └── nfd-intel-qat-device-rule.yaml
│ └── tests
│ ├── openssl-qat-engine
│ ├── kustomization.yaml
│ └── openssl-qat-engine.yaml
│ ├── qat-dpdk-test
│ ├── compress-perf.yaml
│ ├── crypto-perf.yaml
│ ├── file.txt
│ ├── kustomization.yaml
│ └── modified-cluster-setup.yaml
│ ├── qatlib-sample-code
│ ├── kustomization.yaml
│ └── qatlib-sample-code.yaml
│ └── resource-claim-template.yaml
├── doc
├── CLUSTER_SETUP.md
├── cdi-spec-generator
│ ├── BUILD.md
│ └── README.md
├── device-faker
│ └── README.md
├── gaudi
│ ├── BUILD.md
│ ├── README.md
│ └── USAGE.md
├── gpu
│ ├── BUILD.md
│ ├── README.md
│ ├── USAGE.md
│ ├── allocation-delayed.puml
│ ├── allocation-immediate.puml
│ ├── complete-overview.puml
│ ├── generate-pngs.sh
│ └── high-level-overview.puml
└── qat
│ ├── BUILD.md
│ ├── README.md
│ ├── TESTING.md
│ └── USAGE.md
├── gaudi.mk
├── go.mod
├── go.sum
├── gpu.mk
├── hack
├── boilerplate.go.txt
├── clusterconfig.yaml
├── fake_libhlml
│ ├── Makefile
│ ├── README.md
│ └── fake_libhlml.c
└── tools.go
├── pkg
├── fakehlml
│ ├── fake_hlml.go
│ └── fake_hlml.h
├── fakesysfs
│ ├── fakesysfs.go
│ ├── gaudi.go
│ ├── gpu.go
│ └── qat.go
├── gaudi
│ ├── cdihelpers
│ │ ├── cdihelpers.go
│ │ └── cdihelpers_test.go
│ ├── device
│ │ ├── device.go
│ │ └── device_test.go
│ └── discovery
│ │ ├── discovery.go
│ │ └── discovery_test.go
├── gpu
│ ├── cdihelpers
│ │ └── cdihelpers.go
│ ├── device
│ │ └── device.go
│ └── discovery
│ │ └── discovery.go
├── helpers
│ ├── device.go
│ ├── device_test.go
│ ├── driver.go
│ ├── helpers.go
│ ├── helpers_test.go
│ ├── node_state.go
│ └── node_state_test.go
├── plugintesthelpers
│ └── plugintesthelpers.go
├── qat
│ ├── cdi
│ │ └── cdi.go
│ └── device
│ │ ├── device.go
│ │ ├── device_test.go
│ │ └── state.go
└── version
│ └── version.go
├── qat.mk
└── test
└── e2e
├── dra_suite_test.go
├── qat
└── qat.go
└── utils
└── utils.go
/.gitignore:
--------------------------------------------------------------------------------
1 | /bin/
2 | /vendor/
3 |
4 | # macOS
5 | .DS_Store
6 |
7 | # files generated by editors
8 | .idea/
9 | *.iml
10 | .vscode/
11 | *.swp
12 | *.sublime-project
13 | *.sublime-workspace
14 | *~
15 | *.o
16 | *.so
17 | *.out
18 |
--------------------------------------------------------------------------------
/.golangci.yaml:
--------------------------------------------------------------------------------
1 | # please keep this alphabetized
2 | linters:
3 | enable:
4 | - asciicheck
5 | - contextcheck
6 | - forcetypeassert
7 | - gocritic
8 | - godot
9 | - gofmt
10 | - goimports
11 | - misspell
12 | - stylecheck
13 | - gocyclo
14 |
15 | run:
16 | tests: true
17 | timeout: 1m
18 |
19 | linters-settings:
20 | gocyclo:
21 | min-complexity: 15
22 | goimports:
23 | local-prefixes: "github.com/intel/intel-resource-drivers-for-kubernetes"
24 | stylecheck:
25 | # default set minus ID - see https://golangci-lint.run/usage/linters/#stylecheck
26 | initialisms: ["ACL", "API", "ASCII", "CPU", "CSS", "DNS", "EOF", "GUID", "HTML", "HTTP", "HTTPS", "IP", "JSON", "QPS", "RAM", "RPC", "SLA", "SMTP", "SQL", "SSH", "TCP", "TLS", "TTL", "UDP", "UI", "GID", "UID", "UUID", "URI", "URL", "UTF8", "VM", "XML", "XMPP", "XSRF", "XSS", "SIP", "RTP", "AMQP", "DB", "TS"]
27 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | ### License
4 |
5 | Intel Resource Drivers for Kubernetes is licensed under the terms in [LICENSE](LICENSE). By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms.
6 |
7 | ### Sign your work
8 |
9 | Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify
10 | the below (from [developercertificate.org](http://developercertificate.org/)):
11 |
12 | ```
13 | Developer Certificate of Origin
14 | Version 1.1
15 |
16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
17 | 660 York Street, Suite 102,
18 | San Francisco, CA 94110 USA
19 |
20 | Everyone is permitted to copy and distribute verbatim copies of this
21 | license document, but changing it is not allowed.
22 |
23 | Developer's Certificate of Origin 1.1
24 |
25 | By making a contribution to this project, I certify that:
26 |
27 | (a) The contribution was created in whole or in part by me and I
28 | have the right to submit it under the open source license
29 | indicated in the file; or
30 |
31 | (b) The contribution is based upon previous work that, to the best
32 | of my knowledge, is covered under an appropriate open source
33 | license and I have the right under that license to submit that
34 | work with modifications, whether created in whole or in part
35 | by me, under the same open source license (unless I am
36 | permitted to submit under a different license), as indicated
37 | in the file; or
38 |
39 | (c) The contribution was provided directly to me by some other
40 | person who certified (a), (b) or (c) and I have not modified
41 | it.
42 |
43 | (d) I understand and agree that this project and the contribution
44 | are public and that a record of the contribution (including all
45 | personal information I submit with it, including my sign-off) is
46 | maintained indefinitely and may be redistributed consistent with
47 | this project or the open source license(s) involved.
48 | ```
49 |
50 | Then you just add a line to every git commit message:
51 |
52 | Signed-off-by: Joe Smith
53 |
54 | Use your real name (sorry, no pseudonyms or anonymous contributions.)
55 |
56 | If you set your `user.name` and `user.email` git configs, you can sign your
57 | commit automatically with `git commit -s`.
58 |
--------------------------------------------------------------------------------
/DEV.md:
--------------------------------------------------------------------------------
1 | Contents:
2 | * [Runtime](#runtime)
3 | * [Enable CDI in Containerd](#enable-cdi-in-containerd)
4 | * [Generated source code](#generated-source-code)
5 | * [Required tools](#required-tools)
6 |
7 |
8 | # Runtime
9 |
10 | The container runtime needs to have CDI injection support:
11 |
12 | - CRI-O: 1.23+, enabled by default.
13 | - Containerd: v1.7+, disabled by default.
14 |
15 | ## Enable CDI in Containerd
16 |
17 | Containerd config file should have `enable_cdi` and `cdi_specs_dir`. Example `/etc/containerd/config.toml`:
18 | ```
19 | version = 2
20 | [plugins]
21 | [plugins."io.containerd.grpc.v1.cri"]
22 | enable_cdi = true
23 | cdi_specs_dir = ["/etc/cdi", "/var/run/cdi"]
24 | ```
25 |
26 | ### Determine your go binaries location from `go install --help`, quote:
27 | > Executables are installed in the directory named by the GOBIN environment
28 | > variable, which defaults to $GOPATH/bin or $HOME/go/bin if the GOPATH
29 | > environment variable is not set. Executables in $GOROOT
30 | > are installed in $GOROOT/bin or $GOTOOLDIR instead of $GOBIN.
31 |
32 | ### Way 1 : install tools with Go:
33 |
34 | #### Add Go binaries directory to PATH
35 | Add this to the end of your `$HOME/.bashrc`:
36 | ```bash
37 | export PATH="$(go env GOPATH)/bin:$PATH"
38 | ```
39 |
40 | #### install tools
41 | ```bash
42 | GO111MODULE=on go install sigs.k8s.io/controller-tools/cmd/controller-gen@latest
43 | GO111MODULE=on go install k8s.io/code-generator/cmd/client-gen@latest
44 | ```
45 |
46 | ### Way 2 : clone and build it:
47 | ```bash
48 | git clone https://github.com/kubernetes-sigs/controller-tools.git
49 | cd controller-tools
50 | go build ./cmd/controller-gen
51 | cd -
52 | git clone https://github.com/kubernetes/code-generator.git
53 | cd code-generator
54 | go build ./cmd/client-gen
55 | cd -
56 | ```
57 |
58 | Make them available in PATH, for instance $HOME/go/bin:
59 | ```bash
60 | cp controller-tools/controller-gen code-generator/client-gen $HOME/go/bin
61 | # ensure it's in the path. You may want to add export to $HOME/.bashrc
62 | echo $PATH | grep -q $HOME/go/bin || export PATH=$HOME/go/bin:$PATH
63 | ```
64 | # Running tests
65 |
66 | Since Q2 '25, the Gaudi DRA driver uses `gohlml` to retrieve health-related information.
67 | There is a hardcoded path to the HLML shared library, and `hack/fake_libhlml` was created based
68 | on the `hlml.h` from the `gohlml` project - it is effectively a stub / mock with flow control support.
69 |
70 | When health-related tests call `gohlml`, it should in turn call the fake `libhlml` instead of the real
71 | one on nodes where no real Gaudi HW and SW are installed (e.g. CI). This means that if the
72 | tests are run on your development machine, you should either deploy a fresh fake `libhlml.so` or
73 | run the tests in a `gaudi-dra-driver-test-image` container like CI does.
74 |
75 | Deploying the fake HLML library instead of the real `libhlml` should allow running tests in VSCode and other IDEs,
76 | after `ldconfig` is [configured properly](hack/fake_libhlml/README.md).
77 |
78 | ## Deploying
79 | ```shell
80 | $ cd hack/fake_libhlml
81 | $ make clean
82 | rm -f fake_libhlml.o fake_libhlml.so
83 | $ make
84 | gcc -O -Wall -Wextra -Wno-unused-parameter -fPIC -c fake_libhlml.c -o fake_libhlml.o
85 | gcc -shared -o fake_libhlml.so fake_libhlml.o
86 | $ sudo cp ./fake_libhlml.so /usr/lib/habanalabs/libhlml.so
87 | $ cat << EOF | sudo tee /etc/ld.so.conf.d/habanalabs.conf
88 | /usr/lib/habanalabs/
89 | EOF
90 |
91 | $ sudo ldconfig
92 | ```
93 |
94 | ## Running tests in container
95 |
96 | To have your own user ID inside container image without access / permission issues, build a fresh
97 | container image, then run tests. The CI uses its own user ID.
98 |
99 | ```shell
100 | $ make test-image
101 | $ make test-containerized
102 | ```
103 |
104 | Tests provide coverage data. If you need to see a coverage report, run the corresponding Make
105 | coverage target, e.g.
106 |
107 | ```
108 | make gaudi-coverage
109 | ```
110 |
--------------------------------------------------------------------------------
/Dockerfile.device-faker:
--------------------------------------------------------------------------------
1 | FROM golang:1.23.4@sha256:70031844b8c225351d0bb63e2c383f80db85d92ba894e3da7e13bcf80efa9a37 as build
2 | ARG LOCAL_LICENSES
3 | WORKDIR /build
4 | COPY . .
5 |
6 | RUN make bin/device-faker && \
7 | mkdir -p /install_root && \
8 | if [ -z "$LOCAL_LICENSES" ]; then \
9 | make licenses; \
10 | fi && \
11 | cp -r licenses /install_root/ && \
12 | cp bin/device-faker /install_root/
13 |
14 |
15 | FROM alpine AS template
16 | COPY --from=build /install_root/device-faker /device-faker
17 |
18 |
19 | RUN mkdir -p /opt/templates && \
20 | /device-faker gpu -n && \
21 | mv /tmp/gpu-template-*.json /opt/templates/gpu-template.json && \
22 | /device-faker gaudi -n && \
23 | mv /tmp/gaudi-template-*.json /opt/templates/gaudi-template.json && \
24 | chmod 644 /opt/templates/*.json
25 |
26 | FROM scratch
27 | LABEL description="Intel Device Faker"
28 | COPY --from=build /install_root/device-faker /device-faker
29 | COPY --from=template /opt/templates /opt/templates
30 | ENTRYPOINT ["/device-faker"]
31 |
--------------------------------------------------------------------------------
/Dockerfile.gaudi:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | ARG HTTP_PROXY
16 | ARG HTTPS_PROXY
17 | ARG NO_PROXY
18 |
19 | FROM golang:1.23.4@sha256:ccdca3b3bde3bfee2518a087b467f2b452fad9ba3e378d3c1578db494c8cb13b as build
20 | ARG LOCAL_LICENSES
21 | WORKDIR /build
22 | COPY . .
23 |
24 | # install libhlml.so
25 | RUN \
26 | export http_proxy=${HTTP_PROXY} https_proxy=${HTTPS_PROXY} no_proxy=${NO_PROXY} && \
27 | curl -fsSL https://vault.habana.ai/artifactory/api/gpg/key/public | gpg --dearmor | tee /etc/apt/trusted.gpg.d/habanalabs.gpg > /dev/null && \
28 | wget -q -O /etc/apt/sources.list.d/habanalabs_synapseai.list "https://vault.habana.ai/artifactory/gaudi-installer/repos/1.16.2/debian10.10/habanalabs_synapseai.list" > /dev/null && \
29 | apt-get update && \
30 | apt-get download habanalabs-firmware-tools && \
31 | ls -al && \
32 | dpkg --force-all -i *.deb
33 |
34 | RUN make gaudi && \
35 | mkdir -p /install_root && \
36 | if [ -z "$LOCAL_LICENSES" ]; then \
37 | make licenses; \
38 | fi && \
39 | cp -r licenses /install_root/ && \
40 | mkdir /install_root/licenses/habanalabs && \
41 | cp /usr/share/doc/habanalabs-firmware-tools/* /install_root/licenses/habanalabs/ && \
42 | cp bin/kubelet-gaudi-plugin /install_root/
43 |
44 | # Get libc and sources from Ubuntu24, libhlml needs GLIBC_2.38
45 | FROM ubuntu:24.04@sha256:80dd3c3b9c6cecb9f1667e9290b3bc61b78c2678c02cbdae5f0fea92cc6734ab as ubuntu
46 | RUN \
47 | cat /etc/apt/sources.list.d/ubuntu.sources && \
48 | sed -i 's/^Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/ubuntu.sources && \
49 | apt-get update && \
50 | apt-get install -y dpkg-dev && \
51 | mkdir /tmp/src && \
52 | cd /tmp/src && \
53 | apt-get source libc6 coreutils dash
54 |
55 | FROM scratch
56 | LABEL description="Intel Gaudi resource driver for Kubernetes"
57 |
58 | COPY --from=build /install_root /
59 | COPY --from=build /usr/lib/habanalabs/libhlml.so /usr/lib/habanalabs/libhlml.so
60 | COPY --from=ubuntu /lib/x86_64-linux-gnu/libc.so.6 /lib/x86_64-linux-gnu/libc.so.6
61 | COPY --from=ubuntu /lib64/ld-linux-x86-64.so.2 /lib64/ld-linux-x86-64.so.2
62 | COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libm.so.6 /usr/lib/x86_64-linux-gnu/libm.so.6
63 | COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libdl.so.2 /usr/lib/x86_64-linux-gnu/libdl.so.2
64 | COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libz.so.1 /usr/lib/x86_64-linux-gnu/libz.so.1
65 | COPY --from=ubuntu /bin/cat /bin/cat
66 | COPY --from=ubuntu /bin/sh /bin/sh
67 | COPY --from=ubuntu /tmp/src/*tar.xz /src/
68 |
69 | ENV LD_LIBRARY_PATH=/usr/lib/habanalabs:/lib/x86_64-linux-gnu:/lib64:/usr/lib/x86_64-linux-gnu
70 | ENV PATH=/bin
71 |
--------------------------------------------------------------------------------
/Dockerfile.gaudi-test:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2025, Intel Corporation. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | FROM golang:1.23.4@sha256:ccdca3b3bde3bfee2518a087b467f2b452fad9ba3e378d3c1578db494c8cb13b as build
15 | WORKDIR /build
16 | COPY . .
17 |
18 | RUN cd hack/fake_libhlml && \
19 | make clean && make
20 |
21 | FROM golang:1.23.4@sha256:ccdca3b3bde3bfee2518a087b467f2b452fad9ba3e378d3c1578db494c8cb13b
22 | ARG UID=1001
23 | ARG GID=1001
24 |
25 | COPY --from=build /build/hack/fake_libhlml/fake_libhlml.so /usr/lib/habanalabs/libhlml.so
26 |
27 | RUN \
28 | echo "existing user: $(id $UID)" && \
29 | groupadd -g ${GID} ubuntu && \
30 | useradd -m -g ${GID} -u ${UID} -s /bin/bash ubuntu && \
31 | mkdir /github && \
32 | chmod 777 /github
33 |
34 | RUN \
35 | mkdir -m 755 /home/ubuntu/.cache/ && \
36 | mkdir -m 755 /home/ubuntu/.cache/go-build && \
37 | mkdir -m 755 /home/ubuntu/.cache/go-mod && \
38 | chown -R ubuntu:ubuntu /home/ubuntu/.cache && \
39 | mkdir /home/ubuntu/src && \
40 | git config --global --add safe.directory /home/ubuntu/src
41 |
42 | ENV GOCACHE=/home/ubuntu/.cache/go-build
43 | ENV GOMODCACHE=/home/ubuntu/.cache/go-mod
44 |
45 | USER ubuntu
46 | WORKDIR /home/ubuntu
47 |
--------------------------------------------------------------------------------
/Dockerfile.gpu:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | FROM golang:1.23.4@sha256:70031844b8c225351d0bb63e2c383f80db85d92ba894e3da7e13bcf80efa9a37 as build
16 | ARG LOCAL_LICENSES
17 | WORKDIR /build
18 | COPY . .
19 |
20 | RUN make gpu && \
21 | mkdir -p /install_root && \
22 | if [ -z "$LOCAL_LICENSES" ]; then \
23 | make licenses; \
24 | fi && \
25 | cp -r licenses /install_root/ && \
26 | cp bin/kubelet-gpu-plugin /install_root/
27 |
28 | FROM scratch
29 | WORKDIR /
30 | LABEL description="Intel GPU resource driver for Kubernetes"
31 |
32 | COPY --from=build /install_root /
33 |
--------------------------------------------------------------------------------
/Dockerfile.qat:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | FROM golang:1.23.4@sha256:70031844b8c225351d0bb63e2c383f80db85d92ba894e3da7e13bcf80efa9a37 as build
16 | ARG LOCAL_LICENSES
17 | WORKDIR /build
18 | COPY . .
19 |
20 | RUN make qat && \
21 | mkdir -p /install_root && \
22 | if [ -z "$LOCAL_LICENSES" ]; then \
23 | make licenses; \
24 | fi && \
25 | cp -r licenses /install_root/ && \
26 | cp bin/kubelet-qat-plugin /install_root/ && \
27 | cp bin/qat-showdevice /install_root/
28 |
29 |
30 | FROM scratch
31 | WORKDIR /
32 | LABEL description="Intel QAT resource driver for Kubernetes"
33 |
34 | COPY --from=build /install_root /
35 |
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | These contents may have been developed with support from one or more Intel-operated generative artificial intelligence solutions.
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Intel resource drivers for Kubernetes
2 |
3 | CAUTION: This is beta / non-production software, do not use it on production clusters.
4 |
5 | ## This repository contains the following resource drivers:
6 |
7 | - [GPU](doc/gpu/README.md)
8 | - [Gaudi](doc/gaudi/README.md)
9 | - [QAT](doc/qat/README.md)
10 |
11 | ## Glossary
12 |
13 | - DRA https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/3063-dynamic-resource-allocation
14 | - CDI https://github.com/cncf-tags/container-device-interface/
15 | - K8s https://github.com/kubernetes/kubernetes.git
16 |
17 | ## About resource drivers
18 |
19 | Intel resource drivers for Kubernetes are an alternative to
20 | [Intel device plugins](https://github.com/intel/intel-device-plugins-for-kubernetes/),
21 | facilitating workload offloading by providing accelerator access on Kubernetes cluster worker nodes.
22 |
23 | Resource drivers are not Linux kernel mode drivers (KMD), and do not help the operating system on
24 | the worker nodes detect and operate the accelerators.
25 |
26 | The resource drivers are based on the Dynamic Resource Allocation (DRA) framework in Kubernetes.
27 |
28 | ### About Dynamic Resource Allocation
29 |
30 | Dynamic Resource Allocation (DRA) is a resource management framework in Kubernetes (1.26+) that
31 | allows management of special resources in a cluster (typically HW accelerators) by vendor-provided
32 | resource drivers (typically a controller and a node-agent / kubelet-plugin) in a common way.
33 |
34 | Resource drivers are meant to handle discovery, allocation, accounting of specific resources as well
35 | as their preparation for Pod before Pod startup, and cleanup after the Pod has completed successfully
36 | and the resource is no longer needed. More info is
37 | [in the KEP](https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/3063-dynamic-resource-allocation)
38 |
39 |
40 | ## Release process
41 |
42 | Every resource driver in this repository has its own releases, release branches and version tags.
43 |
44 | Typical release cadence is quarterly. During the release creation the project's documentation,
45 | deployment files etc. will be changed to point to the newly created version.
46 |
47 | Once the content is available in the main branch and validation passes, a release branch will be
48 | created (e.g. gpu-release-v0.2.0). The HEAD of the release branch will also be tagged with the corresponding
49 | tag (e.g. gpu-v0.2.0).
50 |
51 | During the release creation, the project's documentation, deployment files etc. will be changed to
52 | point to the newly created version.
53 |
54 | Patch releases (e.g. gaudi-v0.1.1) are done on an as-needed basis if there are security issues or minor fixes
55 | for a specific supported version. Fixes are always cherry-picked from the main branch to the release
56 | branches.
57 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and
3 | providing clear guidance on the solution, impact, severity and mitigation.
4 |
5 | ## Reporting a Vulnerability
6 | Please report any security vulnerabilities in this project
7 | [utilizing the guidelines here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html).
8 |
--------------------------------------------------------------------------------
/charts/intel-gaudi-resource-driver/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | # Common backup files
9 | *.swp
10 | *.bak
11 | *.tmp
12 | *.orig
13 | *~
14 | # Various IDEs
15 | .project
16 | .idea/
17 | *.tmproj
18 | .vscode/
19 |
--------------------------------------------------------------------------------
/charts/intel-gaudi-resource-driver/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: intel-gaudi-resource-driver
3 | description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel Gaudi Resource Driver
4 |
5 | type: application
6 | version: 0.3.0
7 | appVersion: "v0.3.0"
8 | home: https://github.com/intel/intel-resource-drivers-for-kubernetes/charts
9 |
10 | dependencies:
11 | - name: node-feature-discovery
12 | alias: nfd
13 | version: "0.17.1"
14 | condition: nfd.enabled
15 | repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
16 |
17 | annotations:
18 | org.opencontainers.image.url: "https://github.com/intel/intel-resource-drivers-for-kubernetes"
19 | org.opencontainers.image.source: "https://github.com/intel/intel-resource-drivers-for-kubernetes"
20 | org.opencontainers.image.version: "0.3.0"
21 | org.opencontainers.image.title: "Intel Gaudi Resource Driver"
22 | org.opencontainers.image.description: "This chart installs the Intel Gaudi resource driver on Kubernetes."
23 |
--------------------------------------------------------------------------------
/charts/intel-gaudi-resource-driver/README.md:
--------------------------------------------------------------------------------
1 | # Dynamic Resource Allocation (DRA) Intel Gaudi Driver Helm Chart
2 |
3 | ## The chart installs the Gaudi resource driver:
4 |
5 | - [Gaudi](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main/doc/gaudi/README.md)
6 |
7 | More info: [Intel Resource Drivers for Kubernetes](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main)
8 |
9 |
10 | ## Installing the chart
11 |
12 | ```
13 | helm install intel-gaudi-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gaudi-resource-driver \
14 | --create-namespace \
15 | --namespace intel-gaudi-resource-driver
16 | ```
17 |
18 | ## Uninstalling the chart
19 | ```
20 | helm uninstall intel-gaudi-resource-driver --namespace intel-gaudi-resource-driver
21 | ```
22 | (Optional) Delete the namespace:
23 | ```
24 | kubectl delete ns intel-gaudi-resource-driver
25 | ```
26 |
27 | ## Configuration
28 | See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_helm/#customizing-the-chart-before-installing). To see all configurable options with detailed comments:
29 |
30 | ```
31 | helm show values oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gaudi-resource-driver
32 | ```
33 |
34 | You may also run `helm show values` on this chart's dependencies for additional options.
35 |
36 | | Key | Type | Default |
37 | |-----|------|---------|
38 | | image.repository | string | `intel` |
39 | | image.name | string | `"intel-gaudi-resource-driver"` |
40 | | image.pullPolicy | string | `"IfNotPresent"` |
41 | | image.tag | string | `"v0.3.0"` |
42 |
43 | > [!Note]
44 | > If you change the image tag to be used in Helm chart deployment, ensure that the version of the container image is consistent with deployment YAMLs - they might change between releases.
45 |
--------------------------------------------------------------------------------
/charts/intel-gaudi-resource-driver/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | Thank you for installing {{ .Chart.Name }}.
--------------------------------------------------------------------------------
/charts/intel-gaudi-resource-driver/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 | {{/* Define common helpers */}}
2 | {{- define "intel-gaudi-resource-driver.chart" -}}
3 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
4 | {{- end }}
5 |
6 | {{/* Define the base name for the driver */}}
7 | {{- define "intel-gaudi-resource-driver.baseName" -}}
8 | intel-gaudi-resource-driver
9 | {{- end }}
10 |
11 | {{/* Specific helpers */}}
12 | {{- define "intel-gaudi-resource-driver.name" -}}
13 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
14 | {{- end }}
15 |
16 | {{/* Create a default fully qualified app name */}}
17 | {{- define "intel-gaudi-resource-driver.fullname" -}}
18 | {{- if .Values.fullnameOverride -}}
19 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
20 | {{- else -}}
21 | {{- printf "%s-%s" (include "intel-gaudi-resource-driver.baseName" .) .Release.Name | trunc 63 | trimSuffix "-" -}}
22 | {{- end -}}
23 | {{- end }}
24 |
25 | {{- define "intel-gaudi-resource-driver.namespace" -}}
26 | {{- default .Release.Namespace .Values.namespaceOverride }}
27 | {{- end }}
28 |
29 | {{/* Labels for templates */}}
30 | {{- define "intel-gaudi-resource-driver.labels" -}}
31 | helm.sh/chart: {{ include "intel-gaudi-resource-driver.chart" . }}
32 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
33 | app.kubernetes.io/managed-by: {{ .Release.Service }}
34 | {{- end }}
35 |
36 | {{- define "intel-gaudi-resource-driver.clusterRoleName" -}}
37 | {{- printf "%s-role" (include "intel-gaudi-resource-driver.baseName" .) | trunc 63 | trimSuffix "-" }}
38 | {{- end }}
39 |
40 | {{- define "intel-gaudi-resource-driver.clusterRoleBindingName" -}}
41 | {{- printf "%s-rolebinding" (include "intel-gaudi-resource-driver.baseName" .) | trunc 63 | trimSuffix "-" }}
42 | {{- end }}
43 |
44 | {{- define "intel-gaudi-resource-driver.serviceAccountName" -}}
45 | {{- if .Values.serviceAccount.create -}}
46 | {{- default "intel-gaudi-sa" .Values.serviceAccount.name -}}
47 | {{- end -}}
48 | {{- end }}
49 |
50 | {{/* Define full image name */}}
51 | {{- define "intel-gaudi-resource-driver.fullimage" -}}
52 | {{- printf "%s/%s:%s" .Values.image.repository .Values.image.name .Values.image.tag -}}
53 | {{- end }}
54 |
--------------------------------------------------------------------------------
/charts/intel-gaudi-resource-driver/templates/clusterrole.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1
2 | kind: ClusterRole
3 | metadata:
4 | name: {{ include "intel-gaudi-resource-driver.clusterRoleName" . }}
5 | namespace: {{ include "intel-gaudi-resource-driver.namespace" . }}
6 | rules:
7 | - apiGroups: [""]
8 | resources: ["nodes"]
9 | verbs: ["get"]
10 | - apiGroups: ["resource.k8s.io"]
11 | resources: ["resourceslices"]
12 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
13 | - apiGroups: ["resource.k8s.io"]
14 | resources: ["resourceclaims"]
15 | verbs: ["get"]
16 |
--------------------------------------------------------------------------------
/charts/intel-gaudi-resource-driver/templates/clusterrolebinding.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1
2 | kind: ClusterRoleBinding
3 | metadata:
4 | name: {{ include "intel-gaudi-resource-driver.clusterRoleBindingName" . }}
5 | namespace: {{ include "intel-gaudi-resource-driver.namespace" . }}
6 | subjects:
7 | - kind: ServiceAccount
8 | name: {{ include "intel-gaudi-resource-driver.serviceAccountName" . }}
9 | namespace: {{ include "intel-gaudi-resource-driver.namespace" . }}
10 | roleRef:
11 | kind: ClusterRole
12 | name: {{ include "intel-gaudi-resource-driver.clusterRoleName" . }}
13 | apiGroup: rbac.authorization.k8s.io
14 |
--------------------------------------------------------------------------------
/charts/intel-gaudi-resource-driver/templates/device-class.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: DeviceClass
3 | metadata:
4 | name: gaudi.intel.com
5 |
6 | spec:
7 | selectors:
8 | - cel:
9 | expression: device.driver == "gaudi.intel.com"
10 |
--------------------------------------------------------------------------------
/charts/intel-gaudi-resource-driver/templates/nfd.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.nfd.enabled }}
2 | apiVersion: nfd.k8s-sigs.io/v1alpha1
3 | kind: NodeFeatureRule
4 | metadata:
5 | name: intel-gaudi-device-rule
6 | spec:
7 | rules:
8 | - name: "intel.gaudi"
9 | labels:
10 | "intel.feature.node.kubernetes.io/gaudi": "true"
11 | matchFeatures:
12 | - feature: pci.device
13 | matchExpressions:
14 | vendor: {op: In, value: ["1da3"]}
15 | device: {op: In, value: ["1020", "1030"]}
16 | {{- end }}
17 |
--------------------------------------------------------------------------------
/charts/intel-gaudi-resource-driver/templates/resource-driver-namespace.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Namespace
3 | metadata:
4 | name: intel-gaudi-resource-driver
5 |
--------------------------------------------------------------------------------
/charts/intel-gaudi-resource-driver/templates/resource-driver.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 | name: intel-gaudi-resource-driver-kubelet-plugin
5 | namespace: {{ include "intel-gaudi-resource-driver.namespace" . }}
6 | labels:
7 | {{- include "intel-gaudi-resource-driver.labels" . | nindent 4 }}
8 | spec:
9 | selector:
10 | matchLabels:
11 | app: intel-gaudi-resource-driver-kubelet-plugin
12 | template:
13 | metadata:
14 | labels:
15 | app: intel-gaudi-resource-driver-kubelet-plugin
16 | spec:
17 | serviceAccount: {{ include "intel-gaudi-resource-driver.serviceAccountName" . }} # deprecated alias of serviceAccountName; rendered from the same helper so the two never diverge
18 | serviceAccountName: {{ include "intel-gaudi-resource-driver.serviceAccountName" . }}
19 | containers:
20 | - name: kubelet-plugin
21 | image: {{ include "intel-gaudi-resource-driver.fullimage" . }}
22 | imagePullPolicy: {{ .Values.image.pullPolicy }}
23 | command: ["/kubelet-gaudi-plugin"]
24 | env:
25 | - name: NODE_NAME
26 | valueFrom:
27 | fieldRef:
28 | fieldPath: spec.nodeName
29 | - name: POD_NAMESPACE
30 | valueFrom:
31 | fieldRef:
32 | fieldPath: metadata.namespace
33 | - name: SYSFS_ROOT
34 | value: "/sysfs"
35 | volumeMounts:
36 | - name: plugins-registry
37 | mountPath: /var/lib/kubelet/plugins_registry
38 | - name: plugins
39 | mountPath: /var/lib/kubelet/plugins
40 | - name: cdi
41 | mountPath: /etc/cdi
42 | - name: varruncdi
43 | mountPath: /var/run/cdi
44 | # when using fake sysfs - mount at the same place as on host
45 | - name: sysfs
46 | mountPath: "/sysfs"
47 | securityContext:
48 | privileged: false
49 | allowPrivilegeEscalation: false
50 | capabilities:
51 | drop: ["ALL"]
52 | readOnlyRootFilesystem: true
53 | runAsUser: 0
54 | seccompProfile:
55 | type: RuntimeDefault
56 | volumes:
57 | - name: plugins-registry
58 | hostPath:
59 | path: /var/lib/kubelet/plugins_registry
60 | - name: plugins
61 | hostPath:
62 | path: /var/lib/kubelet/plugins
63 | - name: cdi
64 | hostPath:
65 | path: /etc/cdi
66 | - name: varruncdi
67 | hostPath:
68 | path: /var/run/cdi
69 | - name: sysfs
70 | hostPath:
71 | path: /sys
72 | {{- with .Values.kubeletPlugin.tolerations }}
73 | tolerations:
74 | {{- toYaml . | nindent 8 }}
75 | {{- end }}
76 | {{- if .Values.nfd.enabled }}
77 | nodeSelector:
78 | intel.feature.node.kubernetes.io/gaudi: "true"
79 | {{- else }}
80 | {{- with .Values.kubeletPlugin.nodeSelector }}
81 | nodeSelector:
82 | {{- toYaml . | nindent 8 }}
83 | {{- end }}
84 | {{- end }}
85 | {{- with .Values.kubeletPlugin.affinity }}
86 | affinity:
87 | {{- toYaml . | nindent 8 }}
88 | {{- end }}
89 |
--------------------------------------------------------------------------------
/charts/intel-gaudi-resource-driver/templates/serviceaccount.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 | name: {{ include "intel-gaudi-resource-driver.serviceAccountName" . }}
5 | namespace: {{ include "intel-gaudi-resource-driver.namespace" . }}
6 | labels:
7 | {{- include "intel-gaudi-resource-driver.labels" . | nindent 4 }}
8 | {{- with .Values.serviceAccount.annotations }}
9 | annotations:
10 | {{- toYaml . | nindent 4 }}
11 | {{- end }}
12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
13 |
--------------------------------------------------------------------------------
/charts/intel-gaudi-resource-driver/templates/validating-admission-policy-binding.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: admissionregistration.k8s.io/v1
2 | kind: ValidatingAdmissionPolicyBinding
3 | metadata:
4 | name: resourceslices-policy-dra-kubelet-plugin-gaudi
5 | spec:
6 | policyName: resourceslices-policy-dra-kubelet-plugin-gaudi
7 | validationActions: [Deny]
8 |
--------------------------------------------------------------------------------
/charts/intel-gaudi-resource-driver/templates/validating-admission-policy.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: admissionregistration.k8s.io/v1
2 | kind: ValidatingAdmissionPolicy
3 | metadata:
4 | name: resourceslices-policy-dra-kubelet-plugin-gaudi
5 | spec:
6 | failurePolicy: Fail
7 | matchConstraints:
8 | resourceRules:
9 | - apiGroups: ["resource.k8s.io"]
10 | apiVersions: ["v1beta1"]
11 | operations: ["CREATE", "UPDATE", "DELETE"]
12 | resources: ["resourceslices"]
13 | matchConditions: # the policy only applies to requests made by this chart's service account
14 | - name: isRestrictedUser
15 | expression: >-
16 | request.userInfo.username == "system:serviceaccount:{{ include "intel-gaudi-resource-driver.namespace" . }}:{{ include "intel-gaudi-resource-driver.serviceAccountName" . }}"
17 | variables:
18 | - name: userNodeName
19 | expression: >-
20 | request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('')
21 | - name: objectNodeName
22 | expression: >-
23 | (request.operation == "DELETE" ? oldObject : object).spec.?nodeName.orValue("")
24 | validations:
25 | - expression: variables.userNodeName != ""
26 | message: >-
27 | no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled
28 | - expression: variables.userNodeName == variables.objectNodeName
29 | messageExpression: >-
30 | "this user running on node '"+variables.userNodeName+"' may not modify " +
31 | (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'")
32 |
--------------------------------------------------------------------------------
/charts/intel-gaudi-resource-driver/values.yaml:
--------------------------------------------------------------------------------
1 | # Default values for intel-gaudi-resource-driver.
2 | nameOverride: ""
3 | namespaceOverride: "intel-gaudi-resource-driver"
4 | fullnameOverride: ""
5 | selectorLabelsOverride: {}
6 |
7 | imagePullSecrets: []
8 | image:
9 | repository: intel
10 | name: intel-gaudi-resource-driver
11 | pullPolicy: IfNotPresent
12 | tag: "v0.3.0"
13 |
14 | serviceAccount:
15 | create: true
16 | annotations: {}
17 | name: intel-gaudi-resource-driver-service-account
18 | automount: true
19 |
20 | kubeletPlugin:
21 | podAnnotations: {}
22 | nodeSelector: {}
23 | # label used when nfd.enabled is true
24 | #intel.feature.node.kubernetes.io/gaudi: "true"
25 | tolerations:
26 | - key: node-role.kubernetes.io/master
27 | operator: Exists
28 | effect: NoSchedule
29 | - key: node-role.kubernetes.io/control-plane
30 | operator: Exists
31 | effect: NoSchedule
32 | # Refer to the official documentation for Node Feature Discovery (NFD)
33 | # regarding node tainting:
34 | # https://nfd.sigs.k8s.io/usage/customization-guide#node-tainting
35 | - key: "intel.feature.node.kubernetes.io/gaudi"
36 | operator: "Exists"
37 | effect: "NoSchedule"
38 | affinity: {}
39 |
40 | nfd:
41 | enabled: false # change to true to install NFD to the cluster
42 | nameOverride: intel-gaudi-nfd
43 | # TODO: this deprecated NFD option will be replaced in NFD v0.17 with "featureGates.NodeFeatureAPI" (added in v0.16):
44 | # https://kubernetes-sigs.github.io/node-feature-discovery/v0.16/deployment/helm.html#general-parameters
45 | enableNodeFeatureApi: true
46 |
--------------------------------------------------------------------------------
/charts/intel-gpu-resource-driver/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | # Common backup files
9 | *.swp
10 | *.bak
11 | *.tmp
12 | *.orig
13 | *~
14 | # Various IDEs
15 | .project
16 | .idea/
17 | *.tmproj
18 | .vscode/
19 |
--------------------------------------------------------------------------------
/charts/intel-gpu-resource-driver/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: intel-gpu-resource-driver
3 | description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel GPU Resource Driver
4 |
5 | type: application
6 | version: 0.7.0
7 | appVersion: "v0.7.0"
8 | home: https://github.com/intel/intel-resource-drivers-for-kubernetes/charts
9 |
10 | dependencies:
11 | - name: node-feature-discovery
12 | alias: nfd
13 | version: "0.17.1"
14 | condition: nfd.enabled
15 | repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
16 |
17 | annotations:
18 | org.opencontainers.image.url: "https://github.com/intel/intel-resource-drivers-for-kubernetes"
19 | org.opencontainers.image.source: "https://github.com/intel/intel-resource-drivers-for-kubernetes"
20 | org.opencontainers.image.version: "0.7.0"
21 | org.opencontainers.image.title: "Intel GPU Resource Driver"
22 | org.opencontainers.image.description: "This chart installs the Intel GPU resource driver on Kubernetes."
23 |
--------------------------------------------------------------------------------
/charts/intel-gpu-resource-driver/README.md:
--------------------------------------------------------------------------------
1 | # Dynamic Resource Allocation (DRA) Intel GPU Driver Helm Chart
2 |
3 | ## The chart installs GPU resource driver:
4 |
5 | - [GPU](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main/doc/gpu/README.md)
6 |
7 | More info: [Intel Resource Drivers for Kubernetes](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main)
8 |
9 |
10 | ## Installing the chart
11 |
12 | ```
13 | helm install \
14 | --namespace "intel-gpu-resource-driver" \
15 | --create-namespace \
16 | intel-gpu-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver
17 | ```
18 |
19 | > [!NOTE]
20 | > For Kubernetes clusters using [Pod Security Standards](https://kubernetes.io/docs/concepts/security/pod-security-standards/),
21 | > pre-create the namespace with the label that allows the use of HostPath volumes, as shown below.
22 |
23 | ```
24 | kubectl create namespace intel-gpu-resource-driver
25 | kubectl label --overwrite namespace intel-gpu-resource-driver pod-security.kubernetes.io/enforce=privileged
26 | helm install \
27 | --namespace "intel-gpu-resource-driver" \
28 | intel-gpu-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver
29 | ```
30 |
31 | ## Uninstalling the chart
32 | ```
33 | helm uninstall intel-gpu-resource-driver --namespace intel-gpu-resource-driver
34 | ```
35 | (Optional) Delete the namespace:
36 | ```
37 | kubectl delete ns intel-gpu-resource-driver
38 | ```
39 |
40 | ## Configuration
41 | See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_helm/#customizing-the-chart-before-installing). To see all configurable options with detailed comments:
42 |
43 | ```console
44 | helm show values oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver
45 | ```
46 |
47 | You may also run `helm show values` on this chart's dependencies for additional options.
48 |
49 | | Key | Type | Default |
50 | |-----|------|---------|
51 | | image.repository | string | `intel` |
52 | | image.name | string | `"intel-gpu-resource-driver"` |
53 | | image.pullPolicy | string | `"IfNotPresent"` |
54 | | image.tag | string | `"v0.7.0"` |
55 |
--------------------------------------------------------------------------------
/charts/intel-gpu-resource-driver/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | Thank you for installing {{ .Chart.Name }}.
--------------------------------------------------------------------------------
/charts/intel-gpu-resource-driver/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 | {{/* Define common helpers */}}
2 | {{- define "intel-gpu-resource-driver.chart" -}}
3 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
4 | {{- end }}
5 |
6 | {{/* Define the base name for the driver */}}
7 | {{- define "intel-gpu-resource-driver.baseName" -}}
8 | intel-gpu-resource-driver
9 | {{- end }}
10 |
11 | {{- define "intel-gpu-resource-driver.name" -}}
12 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
13 | {{- end }}
14 |
15 | {{- define "intel-gpu-resource-driver.fullname" -}}
16 | {{- if .Values.fullnameOverride -}}
17 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
18 | {{- else -}}
19 | {{- printf "%s-%s" (include "intel-gpu-resource-driver.baseName" .) .Release.Name | trunc 63 | trimSuffix "-" -}}
20 | {{- end -}}
21 | {{- end }}
22 |
23 | {{/* Labels for templates */}}
24 | {{- define "intel-gpu-resource-driver.labels" -}}
25 | helm.sh/chart: {{ include "intel-gpu-resource-driver.chart" . }}
26 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
27 | app.kubernetes.io/managed-by: {{ .Release.Service }}
28 | {{- end }}
29 |
30 | {{- define "intel-gpu-resource-driver.clusterRoleName" -}}
31 | {{- printf "%s-role" (include "intel-gpu-resource-driver.baseName" .) | trunc 63 | trimSuffix "-" }}
32 | {{- end }}
33 |
34 | {{- define "intel-gpu-resource-driver.clusterRoleBindingName" -}}
35 | {{- printf "%s-rolebinding" (include "intel-gpu-resource-driver.baseName" .) | trunc 63 | trimSuffix "-" }}
36 | {{- end }}
37 |
38 | {{- define "intel-gpu-resource-driver.serviceAccountName" -}}
39 | {{- if .Values.serviceAccount.create -}}
40 | {{- default "intel-gpu-sa" .Values.serviceAccount.name -}}
41 | {{- end -}}
42 | {{- end }}
43 |
44 | {{/* Define full image name */}}
45 | {{- define "intel-gpu-resource-driver.fullimage" -}}
46 | {{- printf "%s/%s:%s" .Values.image.repository .Values.image.name .Values.image.tag -}}
47 | {{- end }}
48 |
--------------------------------------------------------------------------------
/charts/intel-gpu-resource-driver/templates/clusterrole.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1
2 | kind: ClusterRole
3 | metadata:
4 | name: {{ include "intel-gpu-resource-driver.clusterRoleName" . }}
5 | namespace: {{ .Release.Namespace }}
6 | rules:
7 | - apiGroups: [""]
8 | resources: ["nodes"]
9 | verbs: ["get"]
10 | - apiGroups: ["resource.k8s.io"]
11 | resources: ["resourceslices"]
12 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
13 | - apiGroups: ["resource.k8s.io"]
14 | resources: ["resourceclaims"]
15 | verbs: ["get"]
16 |
--------------------------------------------------------------------------------
/charts/intel-gpu-resource-driver/templates/clusterrolebinding.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1
2 | kind: ClusterRoleBinding
3 | metadata:
4 | name: {{ include "intel-gpu-resource-driver.clusterRoleBindingName" . }}
5 | namespace: {{ .Release.Namespace }}
6 | subjects:
7 | - kind: ServiceAccount
8 | name: {{ include "intel-gpu-resource-driver.serviceAccountName" . }}
9 | namespace: {{ .Release.Namespace }}
10 | roleRef:
11 | kind: ClusterRole
12 | name: {{ include "intel-gpu-resource-driver.clusterRoleName" . }}
13 | apiGroup: rbac.authorization.k8s.io
14 |
--------------------------------------------------------------------------------
/charts/intel-gpu-resource-driver/templates/device-class.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: DeviceClass
3 | metadata:
4 | name: gpu.intel.com
5 |
6 | spec:
7 | selectors:
8 | - cel:
9 | expression: device.driver == "gpu.intel.com"
10 |
--------------------------------------------------------------------------------
/charts/intel-gpu-resource-driver/templates/node-feature-rules.yaml:
--------------------------------------------------------------------------------
1 | {{- if or .Values.nodeFeatureRules.enabled .Values.nfd.enabled }}
2 | apiVersion: nfd.k8s-sigs.io/v1alpha1
3 | kind: NodeFeatureRule
4 | metadata:
5 | name: intel-gpu-device-rule
6 | spec:
7 | rules:
8 | - name: "intel.gpu"
9 | labels:
10 | "intel.feature.node.kubernetes.io/gpu": "true"
11 | matchFeatures:
12 | - feature: pci.device
13 | matchExpressions:
14 | vendor: {op: In, value: ["8086"]}
15 | class: {op: In, value: ["0300", "0380"]}
16 | matchAny:
17 | - matchFeatures:
18 | - feature: kernel.loadedmodule
19 | matchExpressions:
20 | i915: {op: Exists}
21 | - matchFeatures:
22 | - feature: kernel.enabledmodule
23 | matchExpressions:
24 | i915: {op: Exists}
25 | ---
26 | apiVersion: nfd.k8s-sigs.io/v1alpha1
27 | kind: NodeFeatureRule
28 | metadata:
29 | name: intel-gpu-platform-labeling
30 | spec:
31 | rules:
32 | # A_Series (Alchemist)
33 | - labels:
34 | gpu.intel.com/family: "A_Series"
35 | matchFeatures:
36 | - feature: pci.device
37 | matchExpressions:
38 | class: {op: In, value: ["0300"]}
39 | vendor: {op: In, value: ["8086"]}
40 | device:
41 | op: In
42 | value:
43 | - "56a6"
44 | - "56a5"
45 | - "56a1"
46 | - "56a0"
47 | - "5694"
48 | - "5693"
49 | - "5692"
50 | - "5691"
51 | - "5690"
52 | - "56b3"
53 | - "56b2"
54 | - "56a4"
55 | - "56a3"
56 | - "5697"
57 | - "5696"
58 | - "5695"
59 | - "56b1"
60 | - "56b0"
61 | name: intel.gpu.a.series
62 | # Max_Series
63 | - labels:
64 | gpu.intel.com/family: "Max_Series"
65 | matchFeatures:
66 | - feature: pci.device
67 | matchExpressions:
68 | class: {op: In, value: ["0380"]}
69 | vendor: {op: In, value: ["8086"]}
70 | device:
71 | op: In
72 | value:
73 | - "0bda"
74 | - "0bd5"
75 | - "0bd9"
76 | - "0bdb"
77 | - "0bd7"
78 | - "0bd6"
79 | - "0bd0"
80 | name: intel.gpu.max.series
81 | # Flex_Series
82 | - labels:
83 | gpu.intel.com/family: "Flex_Series"
84 | matchFeatures:
85 | - feature: pci.device
86 | matchExpressions:
87 | class: {op: In, value: ["0300", "0380"]}
88 | vendor: {op: In, value: ["8086"]}
89 | device:
90 | op: In
91 | value:
92 | - "0f00"
93 | - "0f01"
94 | - "0f02"
95 | name: intel.gpu.flex.series
96 | {{- end }}
97 |
--------------------------------------------------------------------------------
/charts/intel-gpu-resource-driver/templates/resource-driver.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 | name: intel-gpu-resource-driver-kubelet-plugin
5 | namespace: {{ .Release.Namespace }}
6 | labels:
7 | {{- include "intel-gpu-resource-driver.labels" . | nindent 4 }}
8 | spec:
9 | selector:
10 | matchLabels:
11 | app: intel-gpu-resource-driver
12 | template:
13 | metadata:
14 | labels:
15 | app: intel-gpu-resource-driver
16 | spec:
17 | serviceAccountName: {{ include "intel-gpu-resource-driver.serviceAccountName" . }}
18 | containers:
19 | - name: kubelet-plugin
20 | image: {{ include "intel-gpu-resource-driver.fullimage" . }}
21 | imagePullPolicy: {{ .Values.image.pullPolicy }}
22 | command: ["/kubelet-gpu-plugin"]
23 | env:
24 | - name: NODE_NAME
25 | valueFrom:
26 | fieldRef:
27 | fieldPath: spec.nodeName
28 | - name: POD_NAMESPACE
29 | valueFrom:
30 | fieldRef:
31 | fieldPath: metadata.namespace
32 | - name: SYSFS_ROOT
33 | value: "/sysfs"
34 | volumeMounts:
35 | - name: plugins-registry
36 | mountPath: /var/lib/kubelet/plugins_registry
37 | - name: plugins
38 | mountPath: /var/lib/kubelet/plugins
39 | - name: cdi
40 | mountPath: /etc/cdi
41 | - name: varruncdi
42 | mountPath: /var/run/cdi
43 | # when using fake sysfs - mount at the same place as on host
44 | - name: sysfs
45 | mountPath: "/sysfs"
46 | securityContext:
47 | privileged: false
48 | allowPrivilegeEscalation: false
49 | capabilities:
50 | drop: ["ALL"]
51 | readOnlyRootFilesystem: true
52 | runAsUser: 0
53 | seccompProfile:
54 | type: RuntimeDefault
55 | volumes:
56 | - name: plugins-registry
57 | hostPath:
58 | path: /var/lib/kubelet/plugins_registry
59 | - name: plugins
60 | hostPath:
61 | path: /var/lib/kubelet/plugins
62 | - name: cdi
63 | hostPath:
64 | path: {{ .Values.cdi.staticPath }}
65 | - name: varruncdi
66 | hostPath:
67 | path: {{ .Values.cdi.dynamicPath}}
68 | - name: sysfs
69 | hostPath:
70 | path: /sys
71 | {{- with .Values.kubeletPlugin.tolerations }}
72 | tolerations:
73 | {{- toYaml . | nindent 8 }}
74 | {{- end }}
75 | {{- if or .Values.nodeFeatureRules.enabled .Values.nfd.enabled }}
76 | nodeSelector:
77 | intel.feature.node.kubernetes.io/gpu: "true"
78 | {{- else }}
79 | {{- with .Values.kubeletPlugin.nodeSelector }}
80 | nodeSelector:
81 | {{- toYaml . | nindent 8 }}
82 | {{- end }}
83 | {{- end }}
84 | {{- with .Values.kubeletPlugin.affinity }}
85 | affinity:
86 | {{- toYaml . | nindent 8 }}
87 | {{- end }}
88 |
--------------------------------------------------------------------------------
/charts/intel-gpu-resource-driver/templates/serviceaccount.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 | name: {{ include "intel-gpu-resource-driver.serviceAccountName" . }}
5 | namespace: {{ .Release.Namespace }}
6 | labels:
7 | {{- include "intel-gpu-resource-driver.labels" . | nindent 4 }}
8 | {{- with .Values.serviceAccount.annotations }}
9 | annotations:
10 | {{- toYaml . | nindent 4 }}
11 | {{- end }}
12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
13 |
--------------------------------------------------------------------------------
/charts/intel-gpu-resource-driver/templates/validating-admission-policy-binding.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: admissionregistration.k8s.io/v1
2 | kind: ValidatingAdmissionPolicyBinding
3 | metadata:
4 | name: resourceslices-policy-dra-kubelet-plugin-gpu
5 | spec:
6 | policyName: resourceslices-policy-dra-kubelet-plugin-gpu
7 | validationActions: [Deny]
8 |
--------------------------------------------------------------------------------
/charts/intel-gpu-resource-driver/templates/validating-admission-policy.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: admissionregistration.k8s.io/v1
2 | kind: ValidatingAdmissionPolicy
3 | metadata:
4 | name: resourceslices-policy-dra-kubelet-plugin-gpu
5 | spec:
6 | failurePolicy: Fail
7 | matchConstraints:
8 | resourceRules:
9 | - apiGroups: ["resource.k8s.io"]
10 | apiVersions: ["v1beta1"]
11 | operations: ["CREATE", "UPDATE", "DELETE"]
12 | resources: ["resourceslices"]
13 | matchConditions:
14 | - name: isRestrictedUser
15 | expression: >-
16 | request.userInfo.username == "system:serviceaccount:{{ .Release.Namespace }}:{{ include "intel-gpu-resource-driver.serviceAccountName" . }}"
17 | variables:
18 | - name: userNodeName
19 | expression: >-
20 | request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('')
21 | - name: objectNodeName
22 | expression: >-
23 | (request.operation == "DELETE" ? oldObject : object).spec.?nodeName.orValue("")
24 | validations:
25 | - expression: variables.userNodeName != ""
26 | message: >-
27 | no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled
28 | - expression: variables.userNodeName == variables.objectNodeName
29 | messageExpression: >-
30 | "this user running on node '"+variables.userNodeName+"' may not modify " +
31 | (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'")
32 |
--------------------------------------------------------------------------------
/charts/intel-gpu-resource-driver/values.yaml:
--------------------------------------------------------------------------------
1 | # Default values for intel-gpu-resource-driver.
2 | nameOverride: ""
3 | fullnameOverride: ""
4 | selectorLabelsOverride: {}
5 |
6 | imagePullSecrets: []
7 | image:
8 | repository: intel
9 | name: intel-gpu-resource-driver
10 | pullPolicy: IfNotPresent
11 | tag: "v0.7.0"
12 |
13 | serviceAccount:
14 | create: true
15 | annotations: {}
16 | name: ""
17 | automount: true
18 |
19 | kubeletPlugin:
20 | podAnnotations: {}
21 | nodeSelector: {} # ignored when .Values.nodeFeatureRules.enabled or .Values.nfd.enabled
22 | tolerations:
23 | - key: node-role.kubernetes.io/master
24 | operator: Exists
25 | effect: NoSchedule
26 | - key: node-role.kubernetes.io/control-plane
27 | operator: Exists
28 | effect: NoSchedule
29 | # Refer to the official documentation for Node Feature Discovery (NFD)
30 | # regarding node tainting:
31 | # https://nfd.sigs.k8s.io/usage/customization-guide#node-tainting
32 | - key: "node.kubernetes.io/gpu"
33 | operator: "Exists"
34 | effect: "NoSchedule"
35 | affinity: {}
36 |
37 | cdi:
38 | staticPath: /etc/cdi
39 | dynamicPath: /var/run/cdi
40 |
41 | nodeFeatureRules:
42 | enabled: false
43 |
44 | nfd:
45 | enabled: false # change to true to install NFD to the cluster
46 | nameOverride: intel-gpu-nfd
47 | enableNodeFeatureApi: true
48 |
--------------------------------------------------------------------------------
/charts/intel-qat-resource-driver/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: intel-qat-resource-driver
3 | description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel QAT Resource Driver
4 |
5 | type: application
6 | version: 0.2.0
7 | appVersion: "v0.2.0"
8 | home: https://github.com/intel/intel-resource-drivers-for-kubernetes/charts
9 |
10 | dependencies:
11 | - name: node-feature-discovery
12 | alias: nfd
13 | version: "0.17.1"
14 | condition: nfd.enabled
15 | repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
16 |
17 | annotations:
18 | org.opencontainers.image.url: "https://github.com/intel/intel-resource-drivers-for-kubernetes"
19 | org.opencontainers.image.source: "https://github.com/intel/intel-resource-drivers-for-kubernetes"
20 | org.opencontainers.image.version: "0.2.0"
21 | org.opencontainers.image.title: "Intel QAT Resource Driver"
22 | org.opencontainers.image.description: "This chart installs the Intel QAT resource driver on Kubernetes."
23 |
--------------------------------------------------------------------------------
/charts/intel-qat-resource-driver/README.md:
--------------------------------------------------------------------------------
1 | # Dynamic Resource Allocation (DRA) Intel QAT Driver Helm Chart
2 |
3 | ## The chart installs QAT resource driver:
4 |
5 | - [QAT](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main/doc/qat/README.md)
6 |
7 | More info: [Intel Resource Drivers for Kubernetes](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main)
8 |
9 |
10 | ## Installing the chart
11 |
12 | ```
13 | helm install intel-qat-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-qat-resource-driver \
14 | --create-namespace \
15 | --namespace intel-qat-resource-driver
16 | ```
17 |
18 | ## Uninstalling the chart
19 | ```
20 | helm uninstall intel-qat-resource-driver --namespace intel-qat-resource-driver
21 | ```
22 | (Optional) Delete the namespace:
23 | ```
24 | kubectl delete ns intel-qat-resource-driver
25 | ```
26 |
27 | ## Configuration
28 | See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_helm/#customizing-the-chart-before-installing). To see all configurable options with detailed comments:
29 |
30 | ```console
31 | helm show values oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-qat-resource-driver
32 | ```
33 |
34 | You may also run `helm show values` on this chart's dependencies for additional options.
35 |
36 | | Key | Type | Default |
37 | |-----|------|---------|
38 | | image.repository | string | `"intel"` |
39 | | image.name | string | `"intel-qat-resource-driver"` |
40 | | image.pullPolicy | string | `"IfNotPresent"` |
41 | | image.tag | string | `"v0.2.0"` |
42 |
43 | If you change the image tag to be used in Helm chart deployment, ensure that the version of the container image is consistent with deployment YAMLs - they might change between releases.
44 |
45 |
46 | ## Read-only file system error for QAT
47 |
48 | When the following error appears in the logs of the QAT Kubelet plugin:
49 | ```
50 | kubectl logs -n intel-qat-resource-driver intel-qat-resource-driver-kubelet-plugin-ttcs6
51 | DRA kubelet plugin
52 | In-cluster config
53 | Setting up CDI
54 | failed to create kubelet plugin driver: cannot enable PF device '0000:6b:00.0': open /sysfs/bus/pci/devices/0000:6b:00.0/sriov_numvfs: read-only file system
55 | ```
56 |
57 | Try resetting QAT by reloading its kernel driver:
58 | ```
59 | rmmod qat_4xxx
60 | modprobe qat_4xxx
61 | ```
62 |
--------------------------------------------------------------------------------
/charts/intel-qat-resource-driver/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | Thank you for installing {{ .Chart.Name }}.
--------------------------------------------------------------------------------
/charts/intel-qat-resource-driver/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 | {{/* Define common helpers */}}
2 | {{- define "intel-qat-resource-driver.chart" -}}
3 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
4 | {{- end }}
5 |
6 | {{/* Define the base name for the driver */}}
7 | {{- define "intel-qat-resource-driver.baseName" -}}
8 | intel-qat-resource-driver
9 | {{- end }}
10 |
11 | {{- define "intel-qat-resource-driver.name" -}}
12 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
13 | {{- end }}
14 |
15 | {{- define "intel-qat-resource-driver.fullname" -}}
16 | {{- if .Values.fullnameOverride -}}
17 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
18 | {{- else -}}
19 | {{- printf "%s-%s" (include "intel-qat-resource-driver.baseName" .) .Release.Name | trunc 63 | trimSuffix "-" -}}
20 | {{- end -}}
21 | {{- end }}
22 |
23 | {{- define "intel-qat-resource-driver.namespace" -}}
24 | {{- default .Release.Namespace .Values.namespaceOverride }}
25 | {{- end }}
26 |
27 | {{/* Labels for templates */}}
28 | {{- define "intel-qat-resource-driver.labels" -}}
29 | helm.sh/chart: {{ include "intel-qat-resource-driver.chart" . }}
30 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
31 | app.kubernetes.io/managed-by: {{ .Release.Service }}
32 | {{- end }}
33 |
34 | {{- define "intel-qat-resource-driver.clusterRoleName" -}}
35 | {{- printf "%s-role" (include "intel-qat-resource-driver.baseName" .) | trunc 63 | trimSuffix "-" }}
36 | {{- end }}
37 |
38 | {{- define "intel-qat-resource-driver.clusterRoleBindingName" -}}
39 | {{- printf "%s-rolebinding" (include "intel-qat-resource-driver.baseName" .) | trunc 63 | trimSuffix "-" }}
40 | {{- end }}
41 |
42 | {{- define "intel-qat-resource-driver.serviceAccountName" -}}
43 | {{- if .Values.serviceAccount.create -}}
44 | {{- default "intel-qat-sa" .Values.serviceAccount.name -}}
45 | {{- end -}}
46 | {{- end }}
47 |
48 | {{/* Define full image name */}}
49 | {{- define "intel-qat-resource-driver.fullimage" -}}
50 | {{- printf "%s/%s:%s" .Values.image.repository .Values.image.name .Values.image.tag -}}
51 | {{- end }}
52 |
--------------------------------------------------------------------------------
/charts/intel-qat-resource-driver/templates/clusterrole.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1
2 | kind: ClusterRole
3 | metadata:
4 | name: {{ include "intel-qat-resource-driver.clusterRoleName" . }}
5 | namespace: {{ include "intel-qat-resource-driver.namespace" . }}
6 | rules:
7 | - apiGroups: [""]
8 | resources: ["nodes"]
9 | verbs: ["get"]
10 | - apiGroups: ["resource.k8s.io"]
11 | resources: ["resourceslices"]
12 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
13 | - apiGroups: ["resource.k8s.io"]
14 | resources: ["resourceclaims"]
15 | verbs: ["get"]
16 |
--------------------------------------------------------------------------------
/charts/intel-qat-resource-driver/templates/clusterrolebinding.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1
2 | kind: ClusterRoleBinding
3 | metadata:
4 | name: {{ include "intel-qat-resource-driver.clusterRoleBindingName" . }}
5 | namespace: {{ include "intel-qat-resource-driver.namespace" . }}
6 | subjects:
7 | - kind: ServiceAccount
8 | name: {{ include "intel-qat-resource-driver.serviceAccountName" . }}
9 | namespace: {{ include "intel-qat-resource-driver.namespace" . }}
10 | roleRef:
11 | kind: ClusterRole
12 | name: {{ include "intel-qat-resource-driver.clusterRoleName" . }}
13 | apiGroup: rbac.authorization.k8s.io
14 |
--------------------------------------------------------------------------------
/charts/intel-qat-resource-driver/templates/device-class.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: DeviceClass
3 | metadata:
4 | name: qat.intel.com
5 |
6 | spec:
7 | selectors:
8 | - cel:
9 | expression: device.driver == "qat.intel.com"
10 |
--------------------------------------------------------------------------------
/charts/intel-qat-resource-driver/templates/nfd.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: nfd.k8s-sigs.io/v1alpha1
2 | kind: NodeFeatureRule
3 | metadata:
4 | name: intel-qat-device-rule
5 | spec:
6 | rules:
7 | - name: "intel.qat"
8 | labels:
9 | feature.node.kubernetes.io/qat: "true"
10 | matchFeatures:
11 | - feature: pci.device
12 | matchExpressions:
13 | vendor: {op: In, value: ["8086"]}
14 | device: {op: In, value: ["4940", "4941", "4944", "4946"]}
15 | class: {op: In, value: ["0b40"]}
16 | - feature: kernel.loadedmodule
17 | matchExpressions:
18 | intel_qat: {op: Exists}
19 | matchAny:
20 | - matchFeatures:
21 | - feature: kernel.loadedmodule
22 | matchExpressions:
23 | vfio_pci: {op: Exists}
24 | - matchFeatures:
25 | - feature: kernel.enabledmodule
26 | matchExpressions:
27 | vfio-pci: {op: Exists}
28 |
--------------------------------------------------------------------------------
/charts/intel-qat-resource-driver/templates/resource-driver-namespace.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Namespace
3 | metadata:
4 | name: intel-qat-resource-driver
5 |
--------------------------------------------------------------------------------
/charts/intel-qat-resource-driver/templates/resource-driver.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 | name: intel-qat-resource-driver-kubelet-plugin
5 | namespace: {{ include "intel-qat-resource-driver.namespace" . }}
6 | labels:
7 | {{- include "intel-qat-resource-driver.labels" . | nindent 4 }}
8 | spec:
9 | selector:
10 | matchLabels:
11 | app: intel-qat-resource-driver
12 | template:
13 | metadata:
14 | labels:
15 | app: intel-qat-resource-driver
16 | spec:
17 | serviceAccount: intel-qat-resource-driver-service-account
18 | serviceAccountName: {{ include "intel-qat-resource-driver.serviceAccountName" . }}
19 | containers:
20 | - name: kubelet-plugin
21 | image: {{ include "intel-qat-resource-driver.fullimage" . }}
22 | imagePullPolicy: {{ .Values.image.pullPolicy }}
23 | command: ["/kubelet-qat-plugin"]
24 | env:
25 | - name: NODE_NAME
26 | valueFrom:
27 | fieldRef:
28 | fieldPath: spec.nodeName
29 | - name: POD_NAMESPACE
30 | valueFrom:
31 | fieldRef:
32 | fieldPath: metadata.namespace
33 | - name: SYSFS_ROOT
34 | value: "/sysfs"
35 | volumeMounts:
36 | - name: plugins-registry
37 | mountPath: /var/lib/kubelet/plugins_registry
38 | - name: plugins
39 | mountPath: /var/lib/kubelet/plugins
40 | - name: cdi
41 | mountPath: /etc/cdi
42 | - name: varruncdi
43 | mountPath: /var/run/cdi
44 | - name: sysfs
45 | mountPath: /sysfs
46 | - name: qatconfiguration
47 | mountPath: /defaults
48 | securityContext:
49 | privileged: true
50 | readOnlyRootFilesystem: true
51 | seccompProfile:
52 | type: RuntimeDefault
53 | volumes:
54 | - name: plugins-registry
55 | hostPath:
56 | path: /var/lib/kubelet/plugins_registry
57 | - name: plugins
58 | hostPath:
59 | path: /var/lib/kubelet/plugins
60 | - name: cdi
61 | hostPath:
62 | path: /etc/cdi
63 | - name: varruncdi
64 | hostPath:
65 | path: /var/run/cdi
66 | - name: sysfs
67 | hostPath:
68 | path: /sys
69 | - name: qatconfiguration
70 | configMap:
71 | name: intel-qat-resource-driver-configuration
72 | optional: true
73 | {{- with .Values.kubeletPlugin.tolerations }}
74 | tolerations:
75 | {{- toYaml . | nindent 8 }}
76 | {{- end }}
77 | {{- with .Values.kubeletPlugin.nodeSelector }}
78 | nodeSelector:
79 | {{- toYaml . | nindent 8 }}
80 | {{- end }}
81 | {{- with .Values.kubeletPlugin.affinity }}
82 | affinity:
83 | {{- toYaml . | nindent 8 }}
84 | {{- end }}
85 |
--------------------------------------------------------------------------------
/charts/intel-qat-resource-driver/templates/serviceaccount.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 | name: {{ include "intel-qat-resource-driver.serviceAccountName" . }}
5 | namespace: {{ include "intel-qat-resource-driver.namespace" . }}
6 | labels:
7 | {{- include "intel-qat-resource-driver.labels" . | nindent 4 }}
8 | {{- with .Values.serviceAccount.annotations }}
9 | annotations:
10 | {{- toYaml . | nindent 4 }}
11 | {{- end }}
12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
13 |
--------------------------------------------------------------------------------
/charts/intel-qat-resource-driver/templates/validating-admission-policy-binding.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: admissionregistration.k8s.io/v1
2 | kind: ValidatingAdmissionPolicyBinding
3 | metadata:
4 | name: resourceslices-policy-dra-kubelet-plugin-qat
5 | spec:
6 | policyName: resourceslices-policy-dra-kubelet-plugin-qat
7 | validationActions: [Deny]
8 |
--------------------------------------------------------------------------------
/charts/intel-qat-resource-driver/templates/validating-admission-policy.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: admissionregistration.k8s.io/v1
2 | kind: ValidatingAdmissionPolicy
3 | metadata:
4 | name: resourceslices-policy-dra-kubelet-plugin-qat
5 | spec:
6 | failurePolicy: Fail
7 | matchConstraints:
8 | resourceRules:
9 | - apiGroups: ["resource.k8s.io"]
10 | apiVersions: ["v1beta1"]
11 | operations: ["CREATE", "UPDATE", "DELETE"]
12 | resources: ["resourceslices"]
13 | matchConditions:
14 | - name: isRestrictedUser
15 | expression: >-
16 | request.userInfo.username == "system:serviceaccount:intel-qat-resource-driver:intel-qat-resource-driver-service-account"
17 | variables:
18 | - name: userNodeName
19 | expression: >-
20 | request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('')
21 | - name: objectNodeName
22 | expression: >-
23 | (request.operation == "DELETE" ? oldObject : object).spec.?nodeName.orValue("")
24 | validations:
25 | - expression: variables.userNodeName != ""
26 | message: >-
27 | no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled
28 | - expression: variables.userNodeName == variables.objectNodeName
29 | messageExpression: >-
30 | "this user running on node '"+variables.userNodeName+"' may not modify " +
31 | (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'")
32 |
--------------------------------------------------------------------------------
/charts/intel-qat-resource-driver/values.yaml:
--------------------------------------------------------------------------------
1 | # Default values for intel-qat-resource-driver.
2 | nameOverride: ""
3 | namespaceOverride: "intel-qat-resource-driver"
4 | fullnameOverride: ""
5 | selectorLabelsOverride: {}
6 |
7 | imagePullSecrets: []
8 | image:
9 | repository: intel
10 | name: intel-qat-resource-driver
11 | pullPolicy: IfNotPresent
12 | tag: "v0.2.0"
13 |
14 | serviceAccount:
15 | create: true
16 | annotations: {}
17 | name: "intel-qat-resource-driver-service-account"
18 | automount: true
19 |
20 | kubeletPlugin:
21 | podAnnotations: {}
22 | nodeSelector:
23 | feature.node.kubernetes.io/qat: "true"
24 | tolerations:
25 | - key: node-role.kubernetes.io/master
26 | operator: Exists
27 | effect: NoSchedule
28 | - key: node-role.kubernetes.io/control-plane
29 | operator: Exists
30 | effect: NoSchedule
31 | # Refer to the official documentation for Node Feature Discovery (NFD)
32 | # regarding node tainting:
33 | # https://nfd.sigs.k8s.io/usage/customization-guide#node-tainting
34 | - key: "node.kubernetes.io/qat"
35 | operator: "Exists"
36 | effect: "NoSchedule"
37 | affinity: {}
38 |
39 | nfd:
40 | enabled: false # change to true to install NFD to the cluster
41 | nameOverride: intel-qat-nfd
42 | # TODO: this deprecated NFD option will be replaced in NFD v0.17 with "featureGates.NodeFeatureAPI" (added in v0.16):
43 | # https://kubernetes-sigs.github.io/node-feature-discovery/v0.16/deployment/helm.html#general-parameters
44 | enableNodeFeatureApi: true
45 |
--------------------------------------------------------------------------------
/cmd/kubelet-gaudi-plugin/main.go:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2025, Intel Corporation. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package main
18 |
19 | import (
20 | "fmt"
21 | "os"
22 |
23 | "github.com/urfave/cli/v2"
24 |
25 | gaudi "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/gaudi/device"
26 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/helpers"
27 | )
28 |
// GaudiFlags holds the Gaudi-specific command-line options. main binds each
// field as the destination of one CLI flag and hands the struct to
// helpers.NewApp.
type GaudiFlags struct {
	// Healthcare is set by the "health-monitoring" flag.
	Healthcare bool
	// HealthcareInterval is set by the "health-interval" flag, whose usage text
	// describes it as the number of seconds between health-monitoring checks.
	HealthcareInterval int
}
33 |
// Defaults and advertised bounds for the Gaudi health-monitoring flags.
const (
	// HealthCareFlagDefault is the default of the "health-monitoring" flag.
	HealthCareFlagDefault = false
	// HealthcareIntervalFlagMin and HealthcareIntervalFlagMax are the bounds
	// shown in the "health-interval" usage text.
	// NOTE(review): nothing in this file enforces the range — presumably it is
	// validated where the flags are consumed; confirm.
	HealthcareIntervalFlagMin = 1
	HealthcareIntervalFlagMax = 3600
	// HealthcareIntervalFlagDefault is the default of the "health-interval" flag.
	HealthcareIntervalFlagDefault = 5
)
40 |
41 | func main() {
42 | gaudiFlags := GaudiFlags{
43 | Healthcare: HealthCareFlagDefault,
44 | HealthcareInterval: HealthcareIntervalFlagDefault,
45 | }
46 |
47 | cliFlags := []cli.Flag{
48 | &cli.BoolFlag{
49 | Name: "health-monitoring",
50 | Aliases: []string{"m"},
51 | Usage: "Actively monitor device health and update ResourceSlice. Requires privileges.",
52 | Value: HealthCareFlagDefault,
53 | Destination: &gaudiFlags.Healthcare,
54 | EnvVars: []string{"HEALTH_MONITORING"},
55 | },
56 | &cli.IntFlag{
57 | Name: "health-interval",
58 | Aliases: []string{"i"},
59 | Usage: fmt.Sprintf("Number of seconds between health-monitoring checks [%v ~ %v]", HealthcareIntervalFlagMin, HealthcareIntervalFlagMax),
60 | Value: HealthcareIntervalFlagDefault,
61 | Destination: &gaudiFlags.HealthcareInterval,
62 | EnvVars: []string{"HEALTH_INTERVAL"},
63 | },
64 | }
65 |
66 | if err := helpers.NewApp(gaudi.DriverName, newDriver, cliFlags, &gaudiFlags).Run(os.Args); err != nil {
67 | fmt.Fprintf(os.Stderr, "Error: %v\n", err)
68 | os.Exit(1)
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/cmd/kubelet-gaudi-plugin/node_state_test.go:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2025, Intel Corporation. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package main
18 |
19 | import (
20 | "reflect"
21 | "testing"
22 |
23 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/gaudi/device"
24 | )
25 |
26 | func TestDeviceInfoDeepCopy(t *testing.T) {
27 | di := device.DeviceInfo{
28 | UID: "f",
29 | Model: "ff",
30 | }
31 |
32 | dc := di.DeepCopy()
33 |
34 | if !reflect.DeepEqual(&di, dc) {
35 | t.Fatalf("device infos %v and %v do not match", di, dc)
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/cmd/kubelet-gpu-plugin/main.go:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2025, Intel Corporation. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package main
18 |
19 | import (
20 | "fmt"
21 | "os"
22 |
23 | "github.com/urfave/cli/v2"
24 |
25 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/gpu/device"
26 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/helpers"
27 | )
28 |
// GPUFlags holds the GPU-specific command-line options. main binds the field
// as a CLI flag destination and hands the struct to helpers.NewApp.
type GPUFlags struct {
	// Partitioning is set by the "partitioning-management" flag, which the
	// flag's own usage text marks as "[Not Supported]".
	Partitioning bool
}
32 |
const (
	// PartitioningDefault is the default of the "partitioning-management" flag.
	PartitioningDefault = false
)
36 |
37 | func main() {
38 | gpuFlags := GPUFlags{}
39 | cliFlags := []cli.Flag{
40 | &cli.BoolFlag{
41 | Name: "partitioning-management",
42 | Aliases: []string{"p"},
43 | Usage: "Manage partitioning physical devices into virtual. [Not Supported]",
44 | Value: PartitioningDefault,
45 | Destination: &gpuFlags.Partitioning,
46 | EnvVars: []string{"PARTITIONING"},
47 | },
48 | }
49 |
50 | if err := helpers.NewApp(device.DriverName, newDriver, cliFlags, &gpuFlags).Run(os.Args); err != nil {
51 | fmt.Fprintf(os.Stderr, "Error: %v\n", err)
52 | os.Exit(1)
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/cmd/kubelet-gpu-plugin/test-claims/empty.json:
--------------------------------------------------------------------------------
1 | {}
2 |
--------------------------------------------------------------------------------
/cmd/kubelet-gpu-plugin/test-claims/invalid.json:
--------------------------------------------------------------------------------
1 | {"foo":"bar",}
2 |
--------------------------------------------------------------------------------
/cmd/kubelet-gpu-plugin/test-claims/multi.json:
--------------------------------------------------------------------------------
1 | {
2 | "uid1": [
3 | {
4 | "request_names": [
5 | "request1"
6 | ],
7 | "pool_name": "node1",
8 | "device_name": "0000-af-00-1-0xabcd",
9 | "cdi_device_ids": [
10 | "0000-af-00-1-0xabcd"
11 | ]
12 | }
13 | ],
14 | "uid2": [
15 | {
16 | "request_names": [
17 | "request1"
18 | ],
19 | "pool_name": "node1",
20 | "device_name": "0000-af-00-2-0xabcd",
21 | "cdi_device_ids": [
22 | "0000-af-00-2-0xabcd"
23 | ]
24 | }
25 | ],
26 | "uid3": [
27 | {
28 | "request_names": [
29 | "request1"
30 | ],
31 | "pool_name": "node1",
32 | "device_name": "0000-af-00-3-0xabcd",
33 | "cdi_device_ids": [
34 | "0000-af-00-3-0xabcd"
35 | ]
36 | }
37 | ]
38 | }
--------------------------------------------------------------------------------
/cmd/kubelet-qat-plugin/clientsets.go:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2024 Intel Corporation
2 | * SPDX-License-Identifier: Apache-2.0
3 | */
4 |
5 | package main
6 |
7 | import (
8 | "fmt"
9 | "os"
10 |
11 | "k8s.io/client-go/kubernetes"
12 | "k8s.io/client-go/rest"
13 | "k8s.io/client-go/tools/clientcmd"
14 | "k8s.io/klog/v2"
15 | )
16 |
// ClientSet creates Kubernetes clients from a lazily built REST configuration.
type ClientSet struct {
	// csconfig caches the REST configuration; newClientSetConfig fills it in
	// on first use and leaves an existing value untouched.
	csconfig *rest.Config
}
20 |
// KubeClient is a named type for kubernetes.Interface; it is what NewKubeClient returns.
type KubeClient kubernetes.Interface
22 |
23 | // Create a new client config. Use KUBECONFIG environment variable if set,
24 | // othewise resort to in-cluster config.
25 | func (c *ClientSet) newClientSetConfig() error {
26 | var err error
27 |
28 | if c.csconfig != nil {
29 | return nil
30 | }
31 |
32 | kubeconfenv := os.Getenv("KUBECONFIG")
33 | if kubeconfenv == "" {
34 | klog.V(5).Info("In-cluster config")
35 |
36 | c.csconfig, err = rest.InClusterConfig()
37 | if err != nil {
38 | return fmt.Errorf("creating in-cluster client configuration: %v", err)
39 | }
40 | } else {
41 | klog.V(5).Infof("Using env variable KUBECONFIG=%s", kubeconfenv)
42 |
43 | c.csconfig, err = clientcmd.BuildConfigFromFlags("", kubeconfenv)
44 | if err != nil {
45 | return fmt.Errorf("creating out-of-cluster client configuration: %v", err)
46 | }
47 |
48 | }
49 |
50 | return nil
51 | }
52 |
53 | func (c *ClientSet) NewKubeClient() (KubeClient, error) {
54 | if err := c.newClientSetConfig(); err != nil {
55 | return nil, err
56 | }
57 |
58 | kubeclient, err := kubernetes.NewForConfig(c.csconfig)
59 | if err != nil {
60 | return nil, fmt.Errorf("creating kubernetes client: %v", err)
61 | }
62 |
63 | return kubeclient, nil
64 | }
65 |
--------------------------------------------------------------------------------
/cmd/kubelet-qat-plugin/config.go:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2024 Intel Corporation
2 | * SPDX-License-Identifier: Apache-2.0
3 | */
4 |
5 | package main
6 |
7 | import (
8 | "encoding/json"
9 | "fmt"
10 | "os"
11 |
12 | "k8s.io/klog/v2"
13 |
14 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/qat/device"
15 | )
16 |
// defaultConfigFile is the path of the JSON file that readConfigFile loads
// the per-host default configuration from.
const defaultConfigFile = "/defaults/qatdefaults.config"
18 |
19 | func readConfigFile(hostname string) (map[string]string, error) {
20 | configBytes, err := os.ReadFile(defaultConfigFile)
21 | if err != nil {
22 | return nil, err
23 | }
24 |
25 | var configFile map[string]map[string]string
26 | if err := json.Unmarshal(configBytes, &configFile); err != nil {
27 | return nil, err
28 | }
29 |
30 | hostConfig, exists := configFile[hostname]
31 | if !exists {
32 | return nil, fmt.Errorf("no config for host '%s' found", hostname)
33 | }
34 |
35 | return hostConfig, nil
36 | }
37 |
38 | func getDefaultConfiguration(hostname string, q device.QATDevices) error {
39 | serviceconfig, err := readConfigFile(hostname)
40 | if err != nil {
41 | klog.Infof("Could not read default config file - leaving unconfigured: %v", err)
42 | return nil
43 | }
44 |
45 | klog.V(5).Infof("Default config for host '%s':", hostname)
46 | for _, pf := range q {
47 | if servicestr, exists := serviceconfig[pf.Device]; exists {
48 | var services device.Services
49 | var err error
50 |
51 | if services, err = device.StringToServices(servicestr); err != nil {
52 | klog.Warningf("Error parsing default config services for PF device '%s': %v", pf.Device, err)
53 | continue
54 | }
55 |
56 | if err := pf.SetServices([]device.Services{services}); err != nil {
57 | klog.Warningf("Error configuring services '%s' for PF device '%s': %v", services.String(), pf.Device, err)
58 | continue
59 | }
60 |
61 | klog.V(5).Infof("PF device '%s' configured with services %s'", pf.Device, services.String())
62 | }
63 | }
64 |
65 | return nil
66 | }
67 |
--------------------------------------------------------------------------------
/cmd/kubelet-qat-plugin/deviceresources.go:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2024 Intel Corporation
2 | * SPDX-License-Identifier: Apache-2.0
3 | */
4 |
5 | package main
6 |
7 | import (
8 | resourceapi "k8s.io/api/resource/v1beta1"
9 | "k8s.io/klog/v2"
10 | "k8s.io/utils/ptr"
11 |
12 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/qat/device"
13 | )
14 |
15 | func deviceResources(qatvfdevices device.VFDevices) *[]resourceapi.Device {
16 | resourcedevices := []resourceapi.Device{}
17 |
18 | for _, qatvfdevice := range qatvfdevices {
19 | device := resourceapi.Device{
20 | Name: qatvfdevice.UID(),
21 | Basic: &resourceapi.BasicDevice{
22 | Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{
23 | "services": {
24 | StringValue: ptr.To(qatvfdevice.Services()),
25 | },
26 | },
27 | },
28 | }
29 | resourcedevices = append(resourcedevices, device)
30 |
31 | klog.V(5).Infof("Adding Device resource: name '%s', service '%s'", device.Name, *device.Basic.Attributes["services"].StringValue)
32 | }
33 |
34 | return &resourcedevices
35 | }
36 |
--------------------------------------------------------------------------------
/cmd/kubelet-qat-plugin/main.go:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2024 Intel Corporation
2 | * SPDX-License-Identifier: Apache-2.0
3 | */
4 |
5 | package main
6 |
7 | import (
8 | "context"
9 | "fmt"
10 | "os"
11 | "os/signal"
12 | "syscall"
13 |
14 | "github.com/spf13/cobra"
15 | utilruntime "k8s.io/apimachinery/pkg/util/runtime"
16 | cliflag "k8s.io/component-base/cli/flag"
17 | "k8s.io/component-base/featuregate"
18 | "k8s.io/component-base/logs"
19 | logsapi "k8s.io/component-base/logs/api/v1"
20 | "k8s.io/component-base/term"
21 | "k8s.io/dynamic-resource-allocation/kubeletplugin"
22 | "k8s.io/klog/v2"
23 |
24 | driverVersion "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/version"
25 | )
26 |
27 | func cmdRun(cmd *cobra.Command, args []string) error {
28 | var (
29 | d *driver
30 | err error
31 | )
32 |
33 | klog.Info("DRA QAT kubelet plugin")
34 | driverVersion.PrintDriverVersion(driverName)
35 |
36 | ctx := context.Background()
37 |
38 | if err := os.MkdirAll(driverPluginPath, 0750); err != nil {
39 | return fmt.Errorf("could not create '%s': %v", driverPluginPath, err)
40 | }
41 |
42 | if d, err = newDriver(ctx); err != nil {
43 | return fmt.Errorf("failed to create kubelet plugin driver: %v", err)
44 | }
45 |
46 | plugin, err := kubeletplugin.Start(
47 | ctx,
48 | []any{d},
49 | kubeletplugin.KubeClient(d.kubeclient),
50 | kubeletplugin.NodeName(d.nodename),
51 | kubeletplugin.DriverName(driverName),
52 | kubeletplugin.RegistrarSocketPath(pluginRegistrationPath),
53 | kubeletplugin.PluginSocketPath(driverPluginSocketPath),
54 | kubeletplugin.KubeletPluginSocketPath(driverPluginSocketPath))
55 | if err != nil {
56 | return fmt.Errorf("failed to start kubelet plugin: %v", err)
57 | }
58 |
59 | d.plugin = plugin
60 |
61 | if err := d.UpdateDeviceResources(ctx); err != nil {
62 | return fmt.Errorf("failed to publish resources: %v", err)
63 | }
64 |
65 | klog.Infof("DRA kubelet plugin %s running...", driverName)
66 |
67 | sigc := make(chan os.Signal, 1)
68 | signal.Notify(sigc, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
69 | <-sigc
70 |
71 | plugin.Stop()
72 |
73 | klog.Infof("DRA kubelet plugin %s done", driverName)
74 |
75 | return nil
76 | }
77 |
78 | func setupCmd() (*cobra.Command, error) {
79 | cmd := &cobra.Command{
80 | Use: "kubelet-plugin",
81 | Short: "Intel QAT resource driver kubelet plugin",
82 | RunE: cmdRun,
83 | }
84 |
85 | logsconfig := logsapi.NewLoggingConfiguration()
86 | fgate := featuregate.NewFeatureGate()
87 | utilruntime.Must(logsapi.AddFeatureGates(fgate))
88 | if err := logsapi.ValidateAndApply(logsconfig, fgate); err != nil {
89 | return nil, err
90 | }
91 |
92 | loggingFlags := cliflag.NamedFlagSets{}
93 | fs := loggingFlags.FlagSet("logging")
94 | logsapi.AddFlags(logsconfig, fs)
95 | logs.AddFlags(fs, logs.SkipLoggingConfigurationFlags())
96 |
97 | cmd.PersistentFlags().AddFlagSet(fs)
98 |
99 | cols, _, _ := term.TerminalSize(cmd.OutOrStdout())
100 | cliflag.SetUsageAndHelpFunc(cmd, loggingFlags, cols)
101 |
102 | return cmd, nil
103 | }
104 |
105 | func main() {
106 | cmd, err := setupCmd()
107 | if err != nil {
108 | fmt.Printf("Error: failed to start: %v", err)
109 | return
110 | }
111 |
112 | // Execute() already prints out the error.
113 | _ = cmd.Execute()
114 | }
115 |
--------------------------------------------------------------------------------
/cmd/qat-showdevice/main.go:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2024 Intel Corporation
2 | * SPDX-License-Identifier: Apache-2.0
3 | */
4 |
5 | package main
6 |
7 | import (
8 | "fmt"
9 |
10 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/qat/device"
11 | )
12 |
13 | func printPFDevice(pfdev *device.PFDevice) {
14 | fmt.Printf("PF device: %s\n", pfdev.Device)
15 | fmt.Printf("State: %s\n", pfdev.State.String())
16 | fmt.Printf("Services: %s\n", pfdev.Services.String())
17 | fmt.Printf("Num VFs: %d\n", pfdev.NumVFs)
18 | fmt.Printf("Max VFs: %d\n", pfdev.TotalVFs)
19 |
20 | for _, vfdev := range pfdev.AvailableDevices {
21 | fmt.Printf("\tVF UID %s: device %s, device node %s, IOMMU %s, driver %s\n", vfdev.UID(), vfdev.PCIDevice(), vfdev.DeviceNode(), vfdev.Iommu(), vfdev.Driver())
22 | }
23 | }
24 |
25 | func main() {
26 | pfdevices, err := device.New()
27 | if err != nil {
28 | fmt.Printf("Error: %v\n", err)
29 | return
30 | }
31 |
32 | if len(pfdevices) == 0 {
33 | fmt.Printf("No PF devices found\n")
34 | return
35 | }
36 |
37 | for _, pfdev := range pfdevices {
38 | printPFDevice(pfdev)
39 | fmt.Printf("---\n\n")
40 | }
41 |
42 | }
43 |
--------------------------------------------------------------------------------
/deployments/gaudi/base/device-class.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: DeviceClass
3 | metadata:
4 | name: gaudi.intel.com
5 |
6 | spec:
7 | selectors:
8 | - cel:
9 | expression: device.driver == "gaudi.intel.com"
10 |
--------------------------------------------------------------------------------
/deployments/gaudi/base/kustomization.yaml:
--------------------------------------------------------------------------------
1 | resources:
2 | - device-class.yaml
3 | - namespace.yaml
4 | - resource-driver.yaml
5 |
6 | images:
7 | - name: intel/intel-gaudi-resource-driver
8 | newTag: v0.3.0
9 |
--------------------------------------------------------------------------------
/deployments/gaudi/base/namespace.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: v1
3 | kind: Namespace
4 | metadata:
5 | name: intel-gaudi-resource-driver
6 |
--------------------------------------------------------------------------------
/deployments/gaudi/base/resource-driver.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: apps/v1
3 | kind: DaemonSet
4 | metadata:
5 | name: intel-gaudi-resource-driver-kubelet-plugin
6 | namespace: intel-gaudi-resource-driver
7 | labels:
8 | app: intel-gaudi-resource-driver-kubelet-plugin
9 | spec:
10 | selector:
11 | matchLabels:
12 | app: intel-gaudi-resource-driver-kubelet-plugin
13 | template:
14 | metadata:
15 | labels:
16 | app: intel-gaudi-resource-driver-kubelet-plugin
17 | spec:
18 | serviceAccount: intel-gaudi-resource-driver-service-account
19 | serviceAccountName: intel-gaudi-resource-driver-service-account
20 | containers:
21 | - name: kubelet-plugin
22 | image: intel/intel-gaudi-resource-driver:v0.3.0
23 | imagePullPolicy: IfNotPresent
24 | command: ["/kubelet-gaudi-plugin", "-m"]
25 | env:
26 | - name: NODE_NAME
27 | valueFrom:
28 | fieldRef:
29 | fieldPath: spec.nodeName
30 | - name: POD_NAMESPACE
31 | valueFrom:
32 | fieldRef:
33 | fieldPath: metadata.namespace
34 | - name: SYSFS_ROOT
35 | value: "/sys"
36 | # Only use DEVFS_ROOT when using fake devfs with device-faker
37 | #- name: DEVFS_ROOT
38 | # value: "/devfs"
39 |
40 | volumeMounts:
41 | - name: plugins-registry
42 | mountPath: /var/lib/kubelet/plugins_registry
43 | - name: plugins
44 | mountPath: /var/lib/kubelet/plugins
45 | - name: cdi
46 | mountPath: /etc/cdi
47 | - name: varruncdi
48 | mountPath: /var/run/cdi
49 | - name: sysfs
50 | mountPath: "/sys"
51 | # Only use DEVFS_ROOT when using fake devfs with device-faker
52 | #- name: devfs
53 | # mountPath: "/devfs"
54 | securityContext:
55 | privileged: true
56 | capabilities:
57 | drop: [ "ALL" ]
58 | readOnlyRootFilesystem: true
59 | runAsUser: 0
60 | seccompProfile:
61 | type: RuntimeDefault
62 | volumes:
63 | - name: plugins-registry
64 | hostPath:
65 | path: /var/lib/kubelet/plugins_registry
66 | - name: plugins
67 | hostPath:
68 | path: /var/lib/kubelet/plugins
69 | - name: cdi
70 | hostPath:
71 | path: /etc/cdi
72 | - name: varruncdi
73 | hostPath:
74 | path: /var/run/cdi
75 | - name: sysfs
76 | hostPath:
77 | path: /sys
78 | # Only use DEVFS_ROOT when using fake devfs with device-faker
79 | #- name: devfs
80 | # hostPath:
81 | # path: /dev
82 |
83 | ---
84 | apiVersion: v1
85 | kind: ServiceAccount
86 | metadata:
87 | name: intel-gaudi-resource-driver-service-account
88 | namespace: intel-gaudi-resource-driver
89 |
90 | ---
91 | apiVersion: rbac.authorization.k8s.io/v1
92 | kind: ClusterRole
93 | metadata:
94 | name: intel-gaudi-resource-driver-role
95 | namespace: intel-gaudi-resource-driver
96 | rules:
97 | - apiGroups: [""]
98 | resources: ["nodes"]
99 | verbs: ["get"]
100 | - apiGroups: ["resource.k8s.io"]
101 | resources: ["resourceslices"]
102 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
103 | - apiGroups: ["resource.k8s.io"]
104 | resources: ["resourceclaims"]
105 | verbs: ["get"]
106 |
107 | ---
108 | apiVersion: rbac.authorization.k8s.io/v1
109 | kind: ClusterRoleBinding
110 | metadata:
111 | name: intel-gaudi-resource-driver-role-binding
112 | namespace: intel-gaudi-resource-driver
113 | subjects:
114 | - kind: ServiceAccount
115 | name: intel-gaudi-resource-driver-service-account
116 | namespace: intel-gaudi-resource-driver
117 | roleRef:
118 | kind: ClusterRole
119 | name: intel-gaudi-resource-driver-role
120 | apiGroup: rbac.authorization.k8s.io
121 |
122 | ---
123 | apiVersion: admissionregistration.k8s.io/v1
124 | kind: ValidatingAdmissionPolicy
125 | metadata:
126 | name: resourceslices-policy-dra-kubelet-plugin-gaudi
127 | spec:
128 | failurePolicy: Fail
129 | matchConstraints:
130 | resourceRules:
131 | - apiGroups: ["resource.k8s.io"]
132 | apiVersions: ["v1beta1"]
133 | operations: ["CREATE", "UPDATE", "DELETE"]
134 | resources: ["resourceslices"]
135 | matchConditions:
136 | - name: isRestrictedUser
137 | expression: >-
138 | request.userInfo.username == "system:serviceaccount:intel-gaudi-resource-driver:intel-gaudi-resource-driver-service-account"
139 | variables:
140 | - name: userNodeName
141 | expression: >-
142 | request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('')
143 | - name: objectNodeName
144 | expression: >-
145 | (request.operation == "DELETE" ? oldObject : object).spec.?nodeName.orValue("")
146 | validations:
147 | - expression: variables.userNodeName != ""
148 | message: >-
149 | no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled
150 | - expression: variables.userNodeName == variables.objectNodeName
151 | messageExpression: >-
152 | "this user running on node '"+variables.userNodeName+"' may not modify " +
153 | (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'")
154 | ---
155 | apiVersion: admissionregistration.k8s.io/v1
156 | kind: ValidatingAdmissionPolicyBinding
157 | metadata:
158 | name: resourceslices-policy-dra-kubelet-plugin-gaudi
159 | spec:
160 | policyName: resourceslices-policy-dra-kubelet-plugin-gaudi
161 | validationActions: [Deny]
162 |
--------------------------------------------------------------------------------
/deployments/gaudi/examples/deployment-inline.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: ResourceClaimTemplate
3 | metadata:
4 | name: two-gaudi3
5 | spec:
6 | spec:
7 | devices:
8 | requests:
9 | - name: gaudi
10 | deviceClassName: gaudi.intel.com
11 | count: 2
12 | selectors:
13 | - cel:
14 | expression: device.attributes["gaudi.intel.com"].model == 'Gaudi3'
15 |
16 | ---
17 | apiVersion: apps/v1
18 | kind: Deployment
19 | metadata:
20 | name: gaudi-test
21 | labels:
22 | app: inline-gpu-deployment
23 | spec:
24 | replicas: 1
25 | selector:
26 | matchLabels:
27 | app: inline-gpu-deployment
28 | template:
29 | metadata:
30 | labels:
31 | app: inline-gpu-deployment
32 | spec:
33 | containers:
34 | - name: with-resource
35 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2
36 | command: ["sh", "-c", "ls -la /dev/accel/ && sleep 300"]
37 | resources:
38 | claims:
39 | - name: resource
40 | - name: without-resource
41 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2
42 | command: ["sh", "-c", "ls -la /dev/ && sleep 300"]
43 | resourceClaims:
44 | - name: resource
45 | resourceClaimTemplateName: two-gaudi3
46 |
--------------------------------------------------------------------------------
/deployments/gaudi/examples/monitor-pod-inline.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: ResourceClaimTemplate
3 | metadata:
4 | name: monitor-claim
5 | spec:
6 | spec:
7 | devices:
8 | requests:
9 | - name: gaudi
10 | deviceClassName: gaudi.intel.com
11 | adminAccess: true
12 | allocationMode: "All"
13 | ---
14 | apiVersion: v1
15 | kind: Pod  # consumes the monitor-claim template declared in the same file
16 | metadata:
17 |   name: monitor-pod
18 | spec:
19 |   restartPolicy: Never
20 |   containers:
21 |   - name: monitor
22 |     image: registry.k8s.io/e2e-test-images/busybox:1.29-2
23 |     command: ["sh", "-c", "ls -la /dev/accel/ && sleep 60"]  # same device directory the other Gaudi examples list
24 |     resources:
25 |       claims:
26 |       - name: resource
27 |   resourceClaims:
28 |   - name: resource
29 |     resourceClaimTemplateName: monitor-claim
30 |
--------------------------------------------------------------------------------
/deployments/gaudi/examples/pod-inline.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: ResourceClaim
3 | metadata:
4 | name: claim1
5 | spec:
6 | devices:
7 | requests:
8 | - name: gaudi
9 | deviceClassName: gaudi.intel.com
10 | ##
11 | ## if one is not enough
12 | # count: 2
13 | ##
14 | ## requesting particular series
15 | # selectors:
16 | # - cel:
17 | # expression: device.attributes["gaudi.intel.com"].model == 'Gaudi2'
18 | ##
19 | ## for monitoring
20 | # adminAccess: true
21 | # allocationMode: "All"
22 | ---
23 | apiVersion: v1
24 | kind: Pod
25 | metadata:
26 | name: test-inline-claim
27 | spec:
28 | restartPolicy: Never
29 | containers:
30 | - name: with-resource
31 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2
32 | command: ["sh", "-c", "ls -la /dev/accel/ && sleep 60"]
33 | resources:
34 | claims:
35 | - name: resource
36 | - name: without-resource
37 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2
38 | command: ["sh", "-c", "ls -la /dev/ && sleep 60"]
39 | resourceClaims:
40 | - name: resource
41 | resourceClaimName: claim1
42 |
--------------------------------------------------------------------------------
/deployments/gaudi/kustomization.yaml:
--------------------------------------------------------------------------------
1 | resources:
2 | - base
3 |
--------------------------------------------------------------------------------
/deployments/gaudi/overlays/device-faker/device-faker.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 | name: intel-gaudi-resource-driver-kubelet-plugin
5 | namespace: intel-gaudi-resource-driver
6 | spec:
7 | template:
8 | spec:
9 | initContainers:
10 | - name: device-faker
11 | image: ger-is-registry.caas.intel.com/dgpu-orchestration/intel-device-faker:v0.1.0
12 | imagePullPolicy: Always
13 | command: ["/device-faker", "gaudi", "-t", "/opt/templates/gaudi-template.json", "-d", "/tmp/fake-root"]
14 | volumeMounts:
15 | - name: fake-root
16 | mountPath: /tmp/fake-root
17 | containers:
18 | - name: kubelet-plugin
19 | env:
20 | - name: SYSFS_ROOT
21 | value: "/fake-sysfs"
22 | volumeMounts:
23 | - name: fake-root
24 | mountPath: /fake-sysfs
25 | subPath: sysfs
26 | - name: fake-root
27 | mountPath: /fake-dev/dri
28 | subPath: dev/dri
29 | - name: fake-root
30 | mountPath: /fake-cdi
31 | subPath: cdi
32 | volumes:
33 | - name: fake-root
34 | emptyDir: {}
35 |
--------------------------------------------------------------------------------
/deployments/gaudi/overlays/device-faker/kustomization.yaml:
--------------------------------------------------------------------------------
1 | resources:
2 | - ../../base
3 |
4 | patches:
5 | - path: remove-sysfs.yaml
6 | - path: device-faker.yaml
7 |
--------------------------------------------------------------------------------
/deployments/gaudi/overlays/device-faker/remove-sysfs.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet  # patch listed in this overlay's kustomization: drops the host sysfs volume and its mount
3 | metadata:
4 |   name: intel-gaudi-resource-driver-kubelet-plugin
5 |   namespace: intel-gaudi-resource-driver
6 | spec:
7 |   template:
8 |     spec:
9 |       containers:
10 |       - name: kubelet-plugin
11 |         volumeMounts:
12 |         - name: sysfs
13 |           mountPath: /sys  # has to equal the mountPath in the base DaemonSet, otherwise the entry is not matched
14 |           $patch: delete
15 |       volumes:
16 |       - name: sysfs
17 |         $patch: delete
18 |
--------------------------------------------------------------------------------
/deployments/gaudi/overlays/nfd_labeled_nodes/add-nodeselector-intel-gaudi.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 | name: intel-gaudi-resource-driver-kubelet-plugin
5 | namespace: intel-gaudi-resource-driver
6 | spec:
7 | template:
8 | spec:
9 | nodeSelector:
10 | intel.feature.node.kubernetes.io/gaudi: "true"
11 |
--------------------------------------------------------------------------------
/deployments/gaudi/overlays/nfd_labeled_nodes/kustomization.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kustomize.config.k8s.io/v1beta1
2 | kind: Kustomization
3 |
4 | resources:
5 | - ../../base
6 | - nfd-intel-gaudi-device-rule.yaml
7 |
8 | patches:
9 | - path: add-nodeselector-intel-gaudi.yaml
10 |
--------------------------------------------------------------------------------
/deployments/gaudi/overlays/nfd_labeled_nodes/nfd-intel-gaudi-device-rule.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: nfd.k8s-sigs.io/v1alpha1
2 | kind: NodeFeatureRule
3 | metadata:
4 | name: intel-gaudi-device-rule
5 | spec:
6 | rules:
7 | - name: "intel.gaudi"
8 | labels:
9 | "intel.feature.node.kubernetes.io/gaudi": "true"
10 | matchFeatures:
11 | - feature: pci.device
12 | matchExpressions:
13 | vendor: {op: In, value: ["1da3"]}
14 | device: {op: In, value: ["1020", "1030"]}
15 |
--------------------------------------------------------------------------------
/deployments/gpu/base/device-class.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: DeviceClass
3 | metadata:
4 | name: gpu.intel.com
5 |
6 | spec:
7 | selectors:
8 | - cel:
9 | expression: device.driver == "gpu.intel.com"
10 |
--------------------------------------------------------------------------------
/deployments/gpu/base/kustomization.yaml:
--------------------------------------------------------------------------------
1 | resources:
2 | - device-class.yaml
3 | - namespace.yaml
4 | - resource-driver.yaml
5 |
6 | images:
7 | - name: intel/intel-gpu-resource-driver
8 | newTag: v0.7.0
9 |
--------------------------------------------------------------------------------
/deployments/gpu/base/namespace.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: v1
3 | kind: Namespace
4 | metadata:
5 | name: intel-gpu-resource-driver
6 |
--------------------------------------------------------------------------------
/deployments/gpu/examples/claim-external-gpu.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: ResourceClaim
3 | metadata:
4 | name: one-flex
5 | spec:
6 | devices:
7 | requests:
8 | - name: gpu
9 | deviceClassName: gpu.intel.com
10 | selectors:
11 | - cel:
12 | expression: device.attributes["gpu.intel.com"].family == 'Flex'
13 |
--------------------------------------------------------------------------------
/deployments/gpu/examples/deployment-inline.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: ResourceClaimTemplate
3 | metadata:
4 | name: gpu-4g
5 | spec:
6 | spec:
7 | devices:
8 | requests:
9 | - name: gpu
10 | deviceClassName: gpu.intel.com
11 | selectors:
12 | - cel:
13 | expression: device.capacity["gpu.intel.com"].memory.compareTo(quantity("4Gi")) >= 0
14 |
15 | ---
16 | apiVersion: apps/v1
17 | kind: Deployment
18 | metadata:
19 | name: gpu-test
20 | labels:
21 | app: inline-gpu-deployment
22 | spec:
23 | replicas: 1
24 | selector:
25 | matchLabels:
26 | app: inline-gpu-deployment
27 | template:
28 | metadata:
29 | labels:
30 | app: inline-gpu-deployment
31 | spec:
32 | containers:
33 | - name: with-resource
34 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2
35 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 300"]
36 | resources:
37 | claims:
38 | - name: resource
39 | - name: without-resource
40 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2
41 | command: ["sh", "-c", "ls -la /dev/ && sleep 300"]
42 | resourceClaims:
43 | - name: resource
44 | resourceClaimTemplateName: gpu-4g
45 |
--------------------------------------------------------------------------------
/deployments/gpu/examples/monitor-pod-inline.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: ResourceClaimTemplate
3 | metadata:
4 | name: monitor-claim
5 | spec:
6 | spec:
7 | devices:
8 | requests:
9 | - name: gpu
10 | deviceClassName: gpu.intel.com
11 | adminAccess: true
12 | allocationMode: "All"
13 | ---
14 | apiVersion: v1
15 | kind: Pod
16 | metadata:
17 | name: monitor-pod
18 | spec:
19 | restartPolicy: Never
20 | containers:
21 | - name: monitor
22 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2
23 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 60"]
24 | resources:
25 | claims:
26 | - name: resource
27 | resourceClaims:
28 | - name: resource
29 | resourceClaimTemplateName: monitor-claim
30 |
--------------------------------------------------------------------------------
/deployments/gpu/examples/pod-for-claim-external-gpu.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: test-one-flex
5 | spec:
6 | restartPolicy: Never
7 | containers:
8 | - name: with-resource
9 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2
10 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 60"]
11 | resources:
12 | claims:
13 | - name: resource
14 | - name: without-resource
15 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2
16 | command: ["sh", "-c", "ls -la /dev/ && sleep 60"]
17 | resourceClaims:
18 | - name: resource
19 | resourceClaimName: one-flex
20 |
--------------------------------------------------------------------------------
/deployments/gpu/examples/pod-inline-gpu.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: ResourceClaimTemplate
3 | metadata:
4 | name: claim1
5 | spec:
6 | spec:
7 | devices:
8 | requests:
9 | - name: gpu
10 | deviceClassName: gpu.intel.com
11 | ##
12 | ## if one is not enough
13 | # count: 2
14 | ##
15 | ## requesting particular series
16 | # selectors:
17 | # - cel:
18 | # expression: device.attributes["gpu.intel.com"].family == 'Flex'
19 | # - cel:
20 | # expression: device.capacity["gpu.intel.com"].memory.compareTo(quantity("4Gi")) >= 0
21 |
22 | ## for monitoring
23 | # adminAccess: true
24 | # allocationMode: "All"
25 | ---
26 | apiVersion: v1
27 | kind: Pod
28 | metadata:
29 | name: test-inline-claim
30 | spec:
31 | restartPolicy: Never
32 | containers:
33 | - name: with-resource
34 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2
35 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 60"]
36 | resources:
37 | claims:
38 | - name: resource
39 | - name: without-resource
40 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2
41 | command: ["sh", "-c", "ls -la /dev/ && sleep 60"]
42 | resourceClaims:
43 | - name: resource
44 | resourceClaimTemplateName: claim1
45 |
--------------------------------------------------------------------------------
/deployments/gpu/intel-xpumanager/gpu-monitor-claim.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: ResourceClaimTemplate
3 | metadata:
4 |   name: intel-gpu-monitor-claim
5 | spec:
6 |   metadata:
7 |     labels:
8 |       app: intel-gpu-monitor-claim
9 |   spec:
10 |     # Admin-access request for all GPUs, same shape as examples/monitor-pod-inline.yaml.
11 |     # NOTE(review): replaces the former `resourceClassName: intel-gpu-monitor`; confirm this matches the intent.
12 |     devices:
13 |       requests:
14 |       - name: gpu
15 |         deviceClassName: gpu.intel.com
16 |         adminAccess: true
17 |         allocationMode: "All"
18 |
--------------------------------------------------------------------------------
/deployments/gpu/intel-xpumanager/kustomization.yaml:
--------------------------------------------------------------------------------
1 | namespace: monitoring
2 | resources:
3 | - https://github.com/intel/xpumanager/deployment/kubernetes/daemonset/base/?ref=V1.2.39
4 | - gpu-monitor-claim.yaml
5 | patches:
6 | - path: xpumd-delete-limits.yaml
7 | target:
8 | kind: DaemonSet
9 | - path: xpumd-add-dra-resource.yaml
10 | target:
11 | kind: DaemonSet
12 |
--------------------------------------------------------------------------------
/deployments/gpu/intel-xpumanager/xpumd-add-dra-resource.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 |   name: intel-xpumanager
5 | spec:
6 |   template:
7 |     spec:
8 |       resourceClaims:
9 |       - name: intel-gpu-resource
10 |         # Set directly on the claim entry, as in the example pods; the old `source:` wrapper is not used.
11 |         resourceClaimTemplateName: intel-gpu-monitor-claim
12 |       containers:
13 |       - name: xpumd
14 |         resources:
15 |           claims:
16 |           - name: intel-gpu-resource
17 |
--------------------------------------------------------------------------------
/deployments/gpu/intel-xpumanager/xpumd-delete-limits.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 | name: intel-xpumanager
5 | spec:
6 | template:
7 | spec:
8 | containers:
9 | - name: xpumd
10 | resources:
11 | limits:
12 | # gpu.intel.com/i915_monitoring: 1
13 | $patch: delete
14 |
--------------------------------------------------------------------------------
/deployments/gpu/kustomization.yaml:
--------------------------------------------------------------------------------
1 | resources:
2 | - base
3 |
--------------------------------------------------------------------------------
/deployments/gpu/overlays/device-faker/device-faker.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 | name: intel-gpu-resource-driver-kubelet-plugin
5 | namespace: intel-gpu-resource-driver
6 | spec:
7 | template:
8 | spec:
9 | initContainers:
10 | - name: device-faker
11 | image: ger-is-registry.caas.intel.com/dgpu-orchestration/intel-device-faker:v0.1.0
12 | imagePullPolicy: Always
13 | command: ["/device-faker", "gpu", "-t", "/opt/templates/gpu-template.json", "-d", "/tmp/fake-root"]
14 | volumeMounts:
15 | - name: fake-root
16 | mountPath: /tmp/fake-root
17 | containers:
18 | - name: kubelet-plugin
19 | env:
20 | - name: SYSFS_ROOT
21 | value: "/fake-sysfs"
22 | volumeMounts:
23 | - name: fake-root
24 | mountPath: /fake-sysfs
25 | subPath: sysfs
26 | - name: fake-root
27 | mountPath: /fake-dev/dri
28 | subPath: dev/dri
29 | - name: fake-root
30 | mountPath: /fake-cdi
31 | subPath: cdi
32 | volumes:
33 | - name: fake-root
34 | emptyDir: {}
35 |
--------------------------------------------------------------------------------
/deployments/gpu/overlays/device-faker/kustomization.yaml:
--------------------------------------------------------------------------------
1 | resources:
2 | - ../../base
3 |
4 | patches:
5 | - path: remove-sysfs.yaml
6 | - path: device-faker.yaml
7 |
--------------------------------------------------------------------------------
/deployments/gpu/overlays/device-faker/remove-sysfs.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 | name: intel-gpu-resource-driver-kubelet-plugin
5 | namespace: intel-gpu-resource-driver
6 | spec:
7 | template:
8 | spec:
9 | containers:
10 | - name: kubelet-plugin
11 | volumeMounts:
12 | - name: sysfs
13 | mountPath: /sysfs
14 | $patch: delete
15 | volumes:
16 | - name: sysfs
17 | $patch: delete
18 |
--------------------------------------------------------------------------------
/deployments/gpu/overlays/nfd_labeled_nodes/add-nodeselector-intel-gpu.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 | name: intel-gpu-resource-driver-kubelet-plugin
5 | namespace: intel-gpu-resource-driver
6 | spec:
7 | template:
8 | spec:
9 | nodeSelector:
10 | intel.feature.node.kubernetes.io/gpu: "true"
11 |
--------------------------------------------------------------------------------
/deployments/gpu/overlays/nfd_labeled_nodes/kustomization.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kustomize.config.k8s.io/v1beta1
2 | kind: Kustomization
3 |
4 | resources:
5 | - ../../base
6 | - nfd-intel-gpu-device-rule.yaml
7 | - nfd-intel-gpu-platform-labeling.yaml
8 |
9 | patches:
10 | - path: add-nodeselector-intel-gpu.yaml
11 |
--------------------------------------------------------------------------------
/deployments/gpu/overlays/nfd_labeled_nodes/nfd-intel-gpu-device-rule.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: nfd.k8s-sigs.io/v1alpha1
2 | kind: NodeFeatureRule
3 | metadata:
4 | name: intel-gpu-device-rule
5 | spec:
6 | rules:
7 | - name: intel.gpu.device
8 | labels:
9 | "intel.feature.node.kubernetes.io/gpu": "true"
10 | matchFeatures:
11 | - feature: pci.device
12 | matchExpressions:
13 | vendor: {op: In, value: ["8086"]}
14 | class: {op: In, value: ["0300", "0380"]}
15 |
--------------------------------------------------------------------------------
/deployments/gpu/overlays/nfd_labeled_nodes/nfd-intel-gpu-platform-labeling.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: nfd.k8s-sigs.io/v1alpha1
2 | kind: NodeFeatureRule
3 | metadata:
4 | name: intel-gpu-platform-labeling
5 | spec:
6 | rules:
7 | # A_Series (Alchemist)
8 | - labels:
9 | gpu.intel.com/family: "A_Series"
10 | matchFeatures:
11 | - feature: pci.device
12 | matchExpressions:
13 | class: {op: In, value: ["0300"]}
14 | vendor: {op: In, value: ["8086"]}
15 | device:
16 | op: In
17 | value:
18 | - "56a6"
19 | - "56a5"
20 | - "56a1"
21 | - "56a0"
22 | - "5694"
23 | - "5693"
24 | - "5692"
25 | - "5691"
26 | - "5690"
27 | - "56b3"
28 | - "56b2"
29 | - "56a4"
30 | - "56a3"
31 | - "5697"
32 | - "5696"
33 | - "5695"
34 | - "56b1"
35 | - "56b0"
36 | name: intel.gpu.a.series
37 | # Max_Series
38 | - labels:
39 | gpu.intel.com/family: "Max_Series"
40 | matchFeatures:
41 | - feature: pci.device
42 | matchExpressions:
43 | class: {op: In, value: ["0380"]}
44 | vendor: {op: In, value: ["8086"]}
45 | device:
46 | op: In
47 | value:
48 | - "0bda"
49 | - "0bd5"
50 | - "0bd9"
51 | - "0bdb"
52 | - "0bd7"
53 | - "0bd6"
54 | - "0bd0"
55 | name: intel.gpu.max.series
56 | # Flex_Series
57 | - labels:
58 | gpu.intel.com/family: "Flex_Series"
59 | matchFeatures:
60 | - feature: pci.device
61 | matchExpressions:
62 | class: {op: In, value: ["0300", "0380"]}
63 | vendor: {op: In, value: ["8086"]}
64 | device:
65 | op: In
66 | value:
67 | - "0f00"
68 | - "0f01"
69 | - "0f02"
70 | name: intel.gpu.flex.series
71 |
--------------------------------------------------------------------------------
/deployments/qat/base/device-class.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: DeviceClass
3 | metadata:
4 | name: qat.intel.com
5 |
6 | spec:
7 | selectors:
8 | - cel:
9 | expression: device.driver == "qat.intel.com"
10 |
--------------------------------------------------------------------------------
/deployments/qat/base/kustomization.yaml:
--------------------------------------------------------------------------------
1 | resources:
2 | - device-class.yaml
3 | - namespace.yaml
4 | - resource-driver.yaml
5 |
6 | images:
7 | - name: intel/intel-qat-resource-driver
8 | newTag: v0.2.0
9 |
--------------------------------------------------------------------------------
/deployments/qat/base/namespace.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Namespace
3 | metadata:
4 | name: intel-qat-resource-driver
5 |
--------------------------------------------------------------------------------
/deployments/qat/base/resource-driver.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 | name: intel-qat-resource-driver-kubelet-plugin
5 | namespace: intel-qat-resource-driver
6 | labels:
7 | app: intel-qat-resource-driver-kubelet-plugin
8 | spec:
9 | selector:
10 | matchLabels:
11 | app: intel-qat-resource-driver-kubelet-plugin
12 | template:
13 | metadata:
14 | labels:
15 | app: intel-qat-resource-driver-kubelet-plugin
16 | spec:
17 | serviceAccount: intel-qat-resource-driver-service-account
18 | serviceAccountName: intel-qat-resource-driver-service-account
19 | containers:
20 | - name: kubelet-plugin
21 | image: intel/intel-qat-resource-driver:v0.1.0
22 | imagePullPolicy: IfNotPresent
23 | command: ["/kubelet-qat-plugin"]
24 | env:
25 | - name: NODE_NAME
26 | valueFrom:
27 | fieldRef:
28 | fieldPath: spec.nodeName
29 | - name: SYSFS_ROOT
30 | value: "/sysfs"
31 | volumeMounts:
32 | - name: plugins-registry
33 | mountPath: /var/lib/kubelet/plugins_registry
34 | - name: plugins
35 | mountPath: /var/lib/kubelet/plugins
36 | - name: cdi
37 | mountPath: /etc/cdi
38 | - name: varruncdi
39 | mountPath: /var/run/cdi
40 | - name: sysfs
41 | mountPath: /sysfs
42 | - name: qatconfiguration
43 | mountPath: /defaults
44 | securityContext:
45 | privileged: true
46 | readOnlyRootFilesystem: true
47 | seccompProfile:
48 | type: RuntimeDefault
49 | volumes:
50 | - name: plugins-registry
51 | hostPath:
52 | path: /var/lib/kubelet/plugins_registry
53 | - name: plugins
54 | hostPath:
55 | path: /var/lib/kubelet/plugins
56 | - name: cdi
57 | hostPath:
58 | path: /etc/cdi
59 | - name: varruncdi
60 | hostPath:
61 | path: /var/run/cdi
62 | - name: sysfs
63 | hostPath:
64 | path: /sys
65 | - name: qatconfiguration
66 | configMap:
67 | name: intel-qat-resource-driver-configuration
68 | optional: true
69 |
70 | ---
71 | apiVersion: v1
72 | kind: ServiceAccount
73 | metadata:
74 | name: intel-qat-resource-driver-service-account
75 | namespace: intel-qat-resource-driver
76 |
77 | ---
78 | apiVersion: rbac.authorization.k8s.io/v1
79 | kind: ClusterRole
80 | metadata:
81 | name: intel-qat-resource-driver-role
82 | namespace: intel-qat-resource-driver
83 | rules:
84 | - apiGroups: [""]
85 | resources: ["nodes"]
86 | verbs: ["get"]
87 | - apiGroups: ["resource.k8s.io"]
88 | resources: ["resourceslices"]
89 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
90 | - apiGroups: ["resource.k8s.io"]
91 | resources: ["resourceclaims"]
92 | verbs: ["get"]
93 |
94 | ---
95 | apiVersion: rbac.authorization.k8s.io/v1
96 | kind: ClusterRoleBinding
97 | metadata:
98 | name: intel-qat-resource-driver-role-binding
99 | namespace: intel-qat-resource-driver
100 | subjects:
101 | - kind: ServiceAccount
102 | name: intel-qat-resource-driver-service-account
103 | namespace: intel-qat-resource-driver
104 | roleRef:
105 | kind: ClusterRole
106 | name: intel-qat-resource-driver-role
107 | apiGroup: rbac.authorization.k8s.io
108 | ---
109 | apiVersion: admissionregistration.k8s.io/v1
110 | kind: ValidatingAdmissionPolicy
111 | metadata:
112 | name: resourceslices-policy-dra-kubelet-plugin-qat
113 | spec:
114 | failurePolicy: Fail
115 | matchConstraints:
116 | resourceRules:
117 | - apiGroups: ["resource.k8s.io"]
118 | apiVersions: ["v1beta1"]
119 | operations: ["CREATE", "UPDATE", "DELETE"]
120 | resources: ["resourceslices"]
121 | matchConditions:
122 | - name: isRestrictedUser
123 | expression: >-
124 | request.userInfo.username == "system:serviceaccount:intel-qat-resource-driver:intel-qat-resource-driver-service-account"
125 | variables:
126 | - name: userNodeName
127 | expression: >-
128 | request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('')
129 | - name: objectNodeName
130 | expression: >-
131 | (request.operation == "DELETE" ? oldObject : object).spec.?nodeName.orValue("")
132 | validations:
133 | - expression: variables.userNodeName != ""
134 | message: >-
135 | no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled
136 | - expression: variables.userNodeName == variables.objectNodeName
137 | messageExpression: >-
138 | "this user running on node '"+variables.userNodeName+"' may not modify " +
139 | (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'")
140 | ---
141 | apiVersion: admissionregistration.k8s.io/v1
142 | kind: ValidatingAdmissionPolicyBinding
143 | metadata:
144 | name: resourceslices-policy-dra-kubelet-plugin-qat
145 | spec:
146 | policyName: resourceslices-policy-dra-kubelet-plugin-qat
147 | validationActions: [Deny]
148 |
--------------------------------------------------------------------------------
/deployments/qat/examples/deployment-inline.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: ResourceClaimTemplate
3 | metadata:
4 |   name: qat-template-sym  # matches devices whose "services" attribute is one of the listed values containing "sym"
5 | spec:
6 |   spec:
7 |     devices:
8 |       requests:
9 |       - name: qat-request-sym
10 |         deviceClassName: qat.intel.com
11 |         selectors:
12 |         - cel:
13 |             expression: |-
14 |               device.attributes["qat.intel.com"].services == "sym" ||
15 |               device.attributes["qat.intel.com"].services == "sym;asym" ||
16 |               device.attributes["qat.intel.com"].services == "sym;dc" ||
17 |               device.attributes["qat.intel.com"].services == "asym;sym" ||
18 |               device.attributes["qat.intel.com"].services == "dc;sym"
19 |
20 | ---
21 | apiVersion: resource.k8s.io/v1beta1
22 | kind: ResourceClaimTemplate
23 | metadata:
24 |   name: qat-template-asym  # matches devices whose "services" attribute is one of the listed values containing "asym"
25 | spec:
26 |   spec:
27 |     devices:
28 |       requests:
29 |       - name: qat-request-asym
30 |         deviceClassName: qat.intel.com
31 |         selectors:
32 |         - cel:
33 |             expression: |-
34 |               device.attributes["qat.intel.com"].services == "asym" ||
35 |               device.attributes["qat.intel.com"].services == "asym;sym" ||
36 |               device.attributes["qat.intel.com"].services == "asym;dc" ||
37 |               device.attributes["qat.intel.com"].services == "sym;asym" ||
38 |               device.attributes["qat.intel.com"].services == "dc;asym"
39 |
40 | ---
41 | apiVersion: resource.k8s.io/v1beta1
42 | kind: ResourceClaimTemplate
43 | metadata:
44 | name: qat-template-dc
45 | spec:
46 | spec:
47 | devices:
48 | requests:
49 | - name: qat-request-dc
50 | deviceClassName: qat.intel.com
51 | selectors:
52 | - cel:
53 | expression: |-
54 | device.attributes["qat.intel.com"].services == "dc" ||
55 | device.attributes["qat.intel.com"].services == "dc;sym" ||
56 | device.attributes["qat.intel.com"].services == "dc;asym" ||
57 | device.attributes["qat.intel.com"].services == "sym;dc" ||
58 | device.attributes["qat.intel.com"].services == "asym;dc" ||
59 | device.attributes["qat.intel.com"].services == "dcc"
60 |
61 | ---
62 | apiVersion: apps/v1  # Deployment is served by the apps API group, not core v1
63 | kind: Deployment
64 | metadata:
65 |   name: qat-sample-sym
66 |   labels:
67 |     app: inline-qat-deployment
68 | spec:
69 |   replicas: 1
70 |   selector:
71 |     matchLabels:
72 |       app: inline-qat-deployment
73 |   template:
74 |     metadata:
75 |       labels:
76 |         app: inline-qat-deployment
77 |     spec:
78 |       containers:
79 |       - name: with-resource  # this container references all three claims declared under resourceClaims
80 |         image: registry.k8s.io/e2e-test-images/busybox:1.29-2
81 |         command: ["sh", "-c", "ls -la /dev/vfio/ && sleep 300"]
82 |         securityContext:
83 |           capabilities:
84 |             add:
85 |               ["IPC_LOCK"]
86 |         resources:
87 |           claims:
88 |           - name: resource-sym
89 |           - name: resource-asym
90 |           - name: resource-dc
91 |       - name: without-resource  # this container references no claims
92 |         image: registry.k8s.io/e2e-test-images/busybox:1.29-2
93 |         command: ["sh", "-c", "ls -la /dev/ && sleep 300"]
94 |       resourceClaims:
95 |       - name: resource-sym
96 |         resourceClaimTemplateName: qat-template-sym
97 |       - name: resource-asym
98 |         resourceClaimTemplateName: qat-template-asym
99 |       - name: resource-dc
100 |         resourceClaimTemplateName: qat-template-dc
101 |
--------------------------------------------------------------------------------
/deployments/qat/examples/intel-qat-resource-driver-configuration.yaml:
--------------------------------------------------------------------------------
1 | kind: ConfigMap
2 | apiVersion: v1
3 | metadata:
4 | name: intel-qat-resource-driver-configuration
5 | namespace: intel-qat-resource-driver
6 | data:
7 | # Map of <PCI device address>: <services> pairs, in a map indexed by hostname
8 | qatdefaults.config: |
9 | { "host-name-here":
10 | {
11 | "0000:aa:00.0": "asym;sym",
12 | "0000:bb:00.0": "dc;sym"
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/deployments/qat/kustomization.yaml:
--------------------------------------------------------------------------------
1 | resources:
2 | - base
3 |
--------------------------------------------------------------------------------
/deployments/qat/overlays/nfd_labeled_nodes/add-nodeselector-intel-qat.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 | name: intel-qat-resource-driver-kubelet-plugin
5 | namespace: intel-qat-resource-driver
6 | spec:
7 | template:
8 | spec:
9 | nodeSelector:
10 | intel.feature.node.kubernetes.io/qat: "true"
11 |
--------------------------------------------------------------------------------
/deployments/qat/overlays/nfd_labeled_nodes/kustomization.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kustomize.config.k8s.io/v1beta1
2 | kind: Kustomization
3 |
4 | resources:
5 | - ../../base
6 | - nfd-intel-qat-device-rule.yaml
7 |
8 | patches:
9 | - path: add-nodeselector-intel-qat.yaml
10 |
--------------------------------------------------------------------------------
/deployments/qat/overlays/nfd_labeled_nodes/nfd-intel-qat-device-rule.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: nfd.k8s-sigs.io/v1alpha1
2 | kind: NodeFeatureRule
3 | metadata:
4 |   name: intel-qat-device-rule
5 | spec:
6 |   rules:
7 |     - name: "intel.qat"
8 |       labels:
9 |         intel.feature.node.kubernetes.io/qat: "true"  # must match the nodeSelector in add-nodeselector-intel-qat.yaml
10 |       matchFeatures:
11 |         - feature: pci.device
12 |           matchExpressions:
13 |             vendor: {op: In, value: ["8086"]}
14 |             device: {op: In, value: ["4940", "4942", "4944", "4946"]}  # QAT physical function IDs; odd IDs (4941, ...) are VFs
15 |             class: {op: In, value: ["0b40"]}
16 |         - feature: kernel.loadedmodule
17 |           matchExpressions:
18 |             intel_qat: {op: Exists}
19 |       matchAny:  # vfio-pci may be either a loaded module or built into the kernel
20 |         - matchFeatures:
21 |             - feature: kernel.loadedmodule
22 |               matchExpressions:
23 |                 vfio_pci: {op: Exists}
24 |         - matchFeatures:
25 |             - feature: kernel.enabledmodule
26 |               matchExpressions:
27 |                 vfio-pci: {op: Exists}
28 |
--------------------------------------------------------------------------------
/deployments/qat/tests/openssl-qat-engine/kustomization.yaml:
--------------------------------------------------------------------------------
1 | resources:
2 | - openssl-qat-engine.yaml
3 |
4 | apiVersion: kustomize.config.k8s.io/v1beta1
5 | kind: Kustomization
6 | images:
7 | - name: openssl-qat-engine:devel
8 | newName: intel/openssl-qat-engine
9 | newTag: devel
10 |
--------------------------------------------------------------------------------
/deployments/qat/tests/openssl-qat-engine/openssl-qat-engine.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: openssl-qat-engine-asym
5 | spec:
6 | restartPolicy: Never
7 | containers:
8 | - name: openssl-qat-engine-asym
9 | image: openssl-qat-engine:devel
10 | imagePullPolicy: IfNotPresent
11 | command: ["testapp","-engine","qathwtest","-async_jobs","1","-c","1","-n","1","-nc","1","-v","-hw_algo","0x0029"]
12 | securityContext:
13 | readOnlyRootFilesystem: true
14 | allowPrivilegeEscalation: false
15 | capabilities:
16 | add:
17 | ["IPC_LOCK"]
18 | resources:
19 | claims:
20 | - name: qat-resource-asym
21 | resourceClaims:
22 | - name: qat-resource-asym
23 | resourceClaimTemplateName: qat-template-asym
24 |
--------------------------------------------------------------------------------
/deployments/qat/tests/qat-dpdk-test/compress-perf.yaml:
--------------------------------------------------------------------------------
1 | kind: Pod
2 | apiVersion: v1
3 | metadata:
4 | name: qat-dpdk-test-compress-perf
5 | spec:
6 | containers:
7 | - name: compress-perf
8 | image: crypto-perf:devel
9 | imagePullPolicy: IfNotPresent
10 | env:
11 | - name: TESTCMD
12 | value: "compress"
13 | - name: PTEST
14 | value: "--driver-name compress_qat --input-file /var/data/file.txt --seg-sz 8192 --compress-level 1:1:9 --num-iter 10 --extended-input-sz 1048576 --max-num-sgl-segs 16 --huffman-enc fixed"
15 | volumeMounts:
16 | - mountPath: /dev/hugepages
17 | name: hugepage
18 | - mountPath: /var/run/dpdk
19 | name: dpdk-runtime
20 | - mountPath: /var/data/
21 | name: testfile
22 | resources:
23 | claims:
24 | - name: qat-resource-dc
25 | requests:
26 | cpu: "3"
27 | memory: "128Mi"
28 | hugepages-2Mi: "128Mi"
29 | limits:
30 | cpu: "3"
31 | memory: "128Mi"
32 | hugepages-2Mi: "128Mi"
33 | securityContext:
34 | readOnlyRootFilesystem: true
35 | allowPrivilegeEscalation: false
36 | capabilities:
37 | add:
38 | ["IPC_LOCK"]
39 | restartPolicy: Never
40 | volumes:
41 | - name: dpdk-runtime
42 | emptyDir:
43 | medium: Memory
44 | - name: hugepage
45 | emptyDir:
46 | medium: HugePages
47 | - name: testfile
48 | configMap:
49 | name: test-data
50 | resourceClaims:
51 | - name: qat-resource-dc
52 | resourceClaimTemplateName: qat-template-dc
53 |
--------------------------------------------------------------------------------
/deployments/qat/tests/qat-dpdk-test/crypto-perf.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | kind: Pod
3 | apiVersion: v1
4 | metadata:
5 | name: qat-dpdk-test-crypto-perf
6 | spec:
7 | containers:
8 | - name: crypto-perf
9 | image: crypto-perf:devel
10 | imagePullPolicy: IfNotPresent
11 | env:
12 | - name: TESTCMD
13 | value: "crypto"
14 | - name: PTEST
15 | value: "--ptest throughput --devtype crypto_qat --optype cipher-only --cipher-algo aes-cbc --cipher-op encrypt --cipher-key-sz 16 --total-ops 10000000 --burst-sz 32 --buffer-sz 64"
16 | volumeMounts:
17 | - mountPath: /dev/hugepages
18 | name: hugepage
19 | - mountPath: /var/run/dpdk
20 | name: dpdk-runtime
21 | resources:
22 | claims:
23 | - name: qat-resource-sym
24 | requests:
25 | cpu: "3"
26 | memory: "128Mi"
27 | hugepages-2Mi: "128Mi"
28 | limits:
29 | cpu: "3"
30 | memory: "128Mi"
31 | hugepages-2Mi: "128Mi"
32 | securityContext:
33 | readOnlyRootFilesystem: true
34 | allowPrivilegeEscalation: false
35 | capabilities:
36 | add:
37 | ["IPC_LOCK"]
38 | restartPolicy: Never
39 | volumes:
40 | - name: dpdk-runtime
41 | emptyDir:
42 | medium: Memory
43 | - name: hugepage
44 | emptyDir:
45 | medium: HugePages
46 | resourceClaims:
47 | - name: qat-resource-sym
48 | resourceClaimTemplateName: qat-template-sym
49 |
--------------------------------------------------------------------------------
/deployments/qat/tests/qat-dpdk-test/kustomization.yaml:
--------------------------------------------------------------------------------
1 | configMapGenerator:
2 | - files:
3 | - file.txt
4 | name: test-data
5 |
6 | resources:
7 | - crypto-perf.yaml
8 | - compress-perf.yaml
9 |
10 | apiVersion: kustomize.config.k8s.io/v1beta1
11 | kind: Kustomization
12 | images:
13 | - name: crypto-perf:devel
14 | newName: intel/crypto-perf
15 | newTag: devel
16 |
--------------------------------------------------------------------------------
/deployments/qat/tests/qat-dpdk-test/modified-cluster-setup.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeadm.k8s.io/v1beta3
2 | kind: ClusterConfiguration
3 | apiServer:
4 | extraArgs:
5 | feature-gates: "DynamicResourceAllocation=true"
6 | runtime-config: "api/alpha=true"
7 | controllerManager:
8 | extraArgs:
9 | feature-gates: "DynamicResourceAllocation=true"
10 | scheduler:
11 | extraArgs:
12 | "feature-gates": "DynamicResourceAllocation=true"
13 | ---
14 | apiVersion: kubelet.config.k8s.io/v1beta1
15 | kind: KubeletConfiguration
16 | featureGates:
17 | DynamicResourceAllocation: true
18 | # DPDK applications that use QAT devices requires cpu manager policy as static.
19 | # In addition, resources should be reserved to enable it.
20 | cpuManagerPolicy: static
21 | kubeReserved:
22 | cpu: "1"
23 | memory: "2Gi"
24 | ephemeral-storage: "1Gi"
25 | ---
26 | apiVersion: kubeadm.k8s.io/v1beta3
27 | kind: InitConfiguration
28 | nodeRegistration:
29 | criSocket: "unix:///var/run/crio/crio.sock"
30 | #criSocket: "unix:///var/run/containerd/containerd.sock"
31 | ---
32 | apiVersion: kubeproxy.config.k8s.io/v1alpha1
33 | kind: KubeProxyConfiguration
34 | featureGates:
35 | DynamicResourceAllocation: true
36 |
--------------------------------------------------------------------------------
/deployments/qat/tests/qatlib-sample-code/kustomization.yaml:
--------------------------------------------------------------------------------
1 | resources:
2 | - qatlib-sample-code.yaml
3 |
4 | apiVersion: kustomize.config.k8s.io/v1beta1
5 | kind: Kustomization
6 | images:
7 | - name: openssl-qat-engine:devel
8 | newName: intel/openssl-qat-engine
9 | newTag: devel
10 |
--------------------------------------------------------------------------------
/deployments/qat/tests/qatlib-sample-code/qatlib-sample-code.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: qatlib-sample-code-sym
5 | spec:
6 | restartPolicy: Never
7 | containers:
8 | - name: qatlib-sample-code-sym
9 | image: openssl-qat-engine:devel
10 | imagePullPolicy: IfNotPresent
11 | command: ["cpa_sample_code", "runTests=1"]
12 | securityContext:
13 | readOnlyRootFilesystem: true
14 | allowPrivilegeEscalation: false
15 | capabilities:
16 | add:
17 | ["IPC_LOCK"]
18 | resources:
19 | claims:
20 | - name: qat-resource-sym
21 | resourceClaims:
22 | - name: qat-resource-sym
23 | resourceClaimTemplateName: qat-template-sym
24 | ---
25 | apiVersion: v1
26 | kind: Pod
27 | metadata:
28 | name: qatlib-sample-code-dc
29 | spec:
30 | restartPolicy: Never
31 | containers:
32 | - name: qatlib-sample-code-dc
33 | image: openssl-qat-engine:devel
34 | imagePullPolicy: IfNotPresent
35 | command: ["cpa_sample_code", "runTests=32"]
36 | securityContext:
37 | readOnlyRootFilesystem: true
38 | allowPrivilegeEscalation: false
39 | capabilities:
40 | add:
41 | ["IPC_LOCK"]
42 | resources:
43 | claims:
44 | - name: qat-resource-dc
45 | resourceClaims:
46 | - name: qat-resource-dc
47 | resourceClaimTemplateName: qat-template-dc
48 |
--------------------------------------------------------------------------------
/deployments/qat/tests/resource-claim-template.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: ResourceClaimTemplate
3 | metadata:
4 | name: qat-template-sym
5 | spec:
6 | spec:
7 | devices:
8 | requests:
9 | - name: qat-request-sym
10 | deviceClassName: qat.intel.com
11 | selectors:
12 | - cel:
13 | expression: |-
14 | device.attributes["qat.intel.com"].services == "sym" ||
15 | device.attributes["qat.intel.com"].services == "sym;asym" ||
16 | device.attributes["qat.intel.com"].services == "sym;dc" ||
17 | device.attributes["qat.intel.com"].services == "asym;sym" ||
18 | device.attributes["qat.intel.com"].services == "dc;sym"
19 | ---
20 | apiVersion: resource.k8s.io/v1beta1
21 | kind: ResourceClaimTemplate
22 | metadata:
23 | name: qat-template-asym
24 | spec:
25 | spec:
26 | devices:
27 | requests:
28 | - name: qat-request-asym
29 | deviceClassName: qat.intel.com
30 | selectors:
31 | - cel:
32 | expression: |-
33 | device.attributes["qat.intel.com"].services == "asym" ||
34 | device.attributes["qat.intel.com"].services == "asym;sym" ||
35 | device.attributes["qat.intel.com"].services == "asym;dc" ||
36 | device.attributes["qat.intel.com"].services == "sym;asym" ||
37 | device.attributes["qat.intel.com"].services == "dc;asym"
38 | ---
39 | apiVersion: resource.k8s.io/v1beta1
40 | kind: ResourceClaimTemplate
41 | metadata:
42 | name: qat-template-dc
43 | spec:
44 | spec:
45 | devices:
46 | requests:
47 | - name: qat-request-dc
48 | deviceClassName: qat.intel.com
49 | selectors:
50 | - cel:
51 | expression: |-
52 | device.attributes["qat.intel.com"].services == "dc" ||
53 | device.attributes["qat.intel.com"].services == "dc;sym" ||
54 | device.attributes["qat.intel.com"].services == "dc;asym" ||
55 | device.attributes["qat.intel.com"].services == "sym;dc" ||
56 | device.attributes["qat.intel.com"].services == "asym;dc" ||
57 | device.attributes["qat.intel.com"].services == "dcc"
58 |
--------------------------------------------------------------------------------
/doc/CLUSTER_SETUP.md:
--------------------------------------------------------------------------------
1 | # Setting up new K8s cluster for usage with Dynamic Resource Allocation resource drivers
2 |
3 | - When in doubt, refer to the main [Kubernetes installation documentation](https://kubernetes.io/docs/setup/independent/create-cluster-kubeadm/).
4 | - Check what version of Kubernetes is [required](../README.md#supported-kubernetes-versions)
5 | - Ensure you are running either CRI-O 1.23+ or Containerd 1.7+ with CDI support enabled, and that [cluster-config](../hack/clusterconfig.yaml) file uses `criSocket` matching it.
6 | - Make sure to enable both the `DynamicResourceAllocation`
7 | [feature-gate](https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/)
8 | and the `resource.k8s.io/v1beta1` API for the Kubernetes api-server during your cluster initialization.
9 | - Example cluster initialization is in [cluster-config](../hack/clusterconfig.yaml) file
10 | ```bash
11 | sudo -E kubeadm init --config hack/clusterconfig.yaml
12 | ```
13 | - Deploy a CNI plugin.
14 | - Verify that `coredns` pod(s) are up: `kubectl get pods -A | grep dns`.
15 |
16 | ## Enable CDI in Containerd
17 |
18 | Containerd config file should have `enable_cdi` and `cdi_specs_dir`. Example `/etc/containerd/config.toml`:
19 | ```
20 | version = 2
21 | [plugins]
22 | [plugins."io.containerd.grpc.v1.cri"]
23 | enable_cdi = true
24 | cdi_specs_dir = ["/etc/cdi", "/var/run/cdi"]
25 | ```
26 |
27 | ## Using minikube
28 |
29 | To create a minikube cluster with DRA, use the command (change the K8s version in the last parameter if needed):
30 | ```shell
31 | minikube start \
32 | --feature-gates=DynamicResourceAllocation=true \
33 | --extra-config=apiserver.feature-gates=DynamicResourceAllocation=true \
34 | --extra-config=apiserver.runtime-config=resource.k8s.io/v1beta1=true \
35 | --extra-config=scheduler.feature-gates=DynamicResourceAllocation=true \
36 | --extra-config=controller-manager.feature-gates=DynamicResourceAllocation=true \
37 | --extra-config=kubelet.feature-gates=DynamicResourceAllocation=true \
38 | --container-runtime=containerd \
39 | --kubernetes-version=1.32.0
40 | ```
41 |
42 | Minikube will start its own Containerd inside the minikube docker container, where CDI needs to be
43 | enabled. Connect to the minikube container and edit containerd config:
44 | ```shell
45 | docker exec -it minikube /bin/bash
46 | vi /etc/containerd/config.toml
47 | ```
48 |
49 | Add two lines into the `[plugins."io.containerd.grpc.v1.cri"]` section:
50 | ```
51 | [plugins."io.containerd.grpc.v1.cri"]
52 | enable_cdi = true
53 | cdi_specs_dir = ["/etc/cdi", "/var/run/cdi"]
54 | ```
55 |
56 | Then save the file, exit the editor, and restart the containerd instance that runs inside the minikube container:
57 | ```
58 | systemctl restart containerd
59 | ```
60 |
61 | Finally, exit from the minikube container.
62 |
--------------------------------------------------------------------------------
/doc/cdi-spec-generator/BUILD.md:
--------------------------------------------------------------------------------
1 | # How to build Intel CDI Spec Generator
2 | A pre-compiled binary is already available for download, eliminating the need for manual building. See documentation [README.md](README.md#Releases)
3 |
4 | ## Prerequisites
5 | - Go 1.22
6 |
7 | ## Building
8 | 1. Clone the repository
9 | ```bash
10 | git clone https://github.com/intel/intel-resource-drivers-for-kubernetes.git
11 | cd intel-resource-drivers-for-kubernetes/cmd/cdi-specs-generator
12 | ```
13 |
14 | 2. Build the executable
15 | ```bash
16 | go build -o intel-cdi-specs-generator main.go
17 | ```
18 | This command will generate an executable named intel-cdi-specs-generator in the current directory.
19 |
20 | ## Verification
21 | To verify that the build was successful, you can check the version of the tool by running:
22 | ```bash
23 | intel-cdi-specs-generator --version
24 | ```
--------------------------------------------------------------------------------
/doc/cdi-spec-generator/README.md:
--------------------------------------------------------------------------------
1 | # Intel CDI Spec Generator
2 |
3 | ## Overview
4 | The Intel CDI Specs Generator is a command line tool to generate Container Device Interface (CDI) specifications for supported accelerators.
5 |
6 | ## Prerequisites
7 | - Administrative privileges on the system to write CDI specs.
8 |
9 | ## Usage
10 | Execute the built executable with the type of device you wish to generate CDI specs for:
11 | ```bash
12 | intel-cdi-specs-generator <gpu|gaudi>
13 | ```
14 |
15 | Supported device types:
16 | - gpu: Use this option to generate CDI specs for Intel GPUs.
17 | - gaudi: Use this option to generate CDI specs for Intel Gaudi accelerators.
18 |
19 | ## Display Version
20 | To display the version of the binary, use the following command:
21 | ```bash
22 | intel-cdi-specs-generator --version
23 | ```
24 |
25 | ## Example Usage
26 | To generate CDI specifications for GPUs, run the tool with gpu as an argument:
27 | ```bash
28 | intel-cdi-specs-generator gpu
29 | ```
30 | This command will detect supported GPUs on the system, and ensure that there is a CDI device record for each of them.
31 |
32 |
33 | ## Building
34 | - [How to build CDI Spec Generator](BUILD.md)
35 |
36 | ## Releases
37 | The binary is available for download in the releases section:
38 | - [Intel Resource Drivers for Kubernetes releases](https://github.com/intel/intel-resource-drivers-for-kubernetes/releases)
39 | - [CDI Spec Generator v0.1.0](https://github.com/intel/intel-resource-drivers-for-kubernetes/releases/tag/specs-generator-v0.1.0)
40 |
--------------------------------------------------------------------------------
/doc/gaudi/BUILD.md:
--------------------------------------------------------------------------------
1 | # How to build Intel Gaudi Resource Driver container image
2 |
3 | ## Platforms supported
4 |
5 | - Linux
6 |
7 | ## Prerequisites
8 |
9 | - Docker or Podman.
10 |
11 | ## Building
12 |
13 | The `Makefile` automates this; the only required tool is Docker or Podman.
14 | To build the container image locally, from the root of this Git repository:
15 | ```bash
16 | make gaudi-container-build
17 | ```
18 |
19 | It is possible to specify custom registry, container image name, and version (tag) as separate
20 | variables to override any part of release container image URL in the build command, e.g.:
21 | ```bash
22 | REGISTRY=myregistry GAUDI_IMAGE_NAME=myimage GAUDI_IMAGE_VERSION=myversion make gaudi-container-build
23 | ```
24 |
25 | or whole resulting image URL (this will ignore REGISTRY, GAUDI_IMAGE_NAME, GAUDI_IMAGE_VERSION even if specified):
26 | ```bash
27 | GAUDI_IMAGE_TAG=myregistry/myimagename:myversion make gaudi-container-build
28 | ```
29 |
30 | To build the container image and push image to the destination registry straight away:
31 | ```bash
32 | REGISTRY=registry.local make gaudi-container-push
33 | ```
34 | or
35 | ```bash
36 | GAUDI_IMAGE_TAG=registry.local/intel-gaudi-resource-driver:latest make gaudi-container-push
37 | ```
38 |
--------------------------------------------------------------------------------
/doc/gaudi/README.md:
--------------------------------------------------------------------------------
1 | # Intel Gaudi resource driver for Kubernetes
2 |
3 | CAUTION: This is beta / non-production software; do not use it on production clusters.
4 |
5 | ## About resource driver
6 |
7 | With structured parameters (K8s v1.31+), the DRA driver publishes ResourceSlice, scheduler allocates
8 | the resources, and the resource driver's kubelet-plugin ensures that the allocated devices are prepared
9 | and available for Pods.
10 |
11 | DRA API graduated to v1beta1 in K8s v1.32. Latest DRA drivers support only K8s v1.32+.
12 |
13 | ## Supported Kubernetes Versions
14 |
15 | Supported Kubernetes versions are listed below:
16 |
17 | | Branch | Kubernetes branch/version | Status | DRA |
18 | |:------------------|:--------------------------------|:------------|:-------------------------------|
19 | | v0.1.0 | Kubernetes v1.27 ~ v1.30 | supported | Classic, Structured Parameters |
20 | | v0.2.0 | Kubernetes v1.31 | unsupported | Structured Parameters |
21 | | v0.3.0 | Kubernetes v1.32+ | supported | Structured Parameters |
22 |
23 | ## Documentation
24 |
25 | - [How to setup a Kubernetes cluster with DRA enabled](../CLUSTER_SETUP.md)
26 | - [How to deploy and use Intel Gaudi resource driver](USAGE.md)
27 | - Optional: [How to build Intel Gaudi resource driver container image](BUILD.md)
28 |
--------------------------------------------------------------------------------
/doc/gpu/BUILD.md:
--------------------------------------------------------------------------------
1 | # How to build Intel GPU Resource Driver container image
2 |
3 | ## Platforms supported
4 |
5 | - Linux
6 |
7 | ## Prerequisites
8 |
9 | - Docker or Podman.
10 |
11 | ## Building
12 |
13 | The `Makefile` automates this; the only required tool is Docker or Podman.
14 | To build the container image locally, from the root of this Git repository:
15 | ```bash
16 | make gpu-container-build
17 | ```
18 |
19 | It is possible to specify custom registry, container image name, and version (tag) as separate
20 | variables to override any part of release container image URL in the build command, e.g.:
21 | ```bash
22 | REGISTRY=myregistry GPU_IMAGE_NAME=myimage GPU_IMAGE_VERSION=myversion make gpu-container-build
23 | ```
24 |
25 | or whole resulting image URL (this will ignore REGISTRY, GPU_IMAGE_NAME, GPU_IMAGE_VERSION even if specified):
26 | ```bash
27 | GPU_IMAGE_TAG=myregistry/myimagename:myversion make gpu-container-build
28 | ```
29 |
30 | To build the container image and push image to the destination registry straight away:
31 | ```bash
32 | REGISTRY=registry.local make gpu-container-push
33 | ```
34 | or
35 | ```bash
36 | GPU_IMAGE_TAG=registry.local/intel-gpu-resource-driver:latest make gpu-container-push
37 | ```
38 |
--------------------------------------------------------------------------------
/doc/gpu/README.md:
--------------------------------------------------------------------------------
1 | # Intel GPU resource driver for Kubernetes
2 |
3 | CAUTION: This is beta / non-production software; do not use it on production clusters.
4 |
5 | ## About resource driver
6 |
7 | With structured parameters (K8s v1.31+), the DRA driver publishes ResourceSlice, scheduler allocates
8 | the resources, and the resource driver's kubelet-plugin ensures that the allocated devices are prepared
9 | and available for Pods.
10 |
11 | DRA API graduated to v1beta1 in K8s v1.32. Latest DRA drivers support only K8s v1.32+.
12 |
13 | ## Supported GPU devices (with Linux kernel Intel `i915` GPU driver):
14 | - Intel® Data Center GPU Max Series
15 | - Intel® Data Center GPU Flex Series
16 | - Intel® Arc A-Series
17 | - Intel® Iris® Xe MAX
18 | - Intel® Integrated graphics
19 |
20 | ## Supported Kubernetes Versions
21 |
22 | Supported Kubernetes versions are listed below:
23 |
24 | | Branch | Kubernetes branch/version | Status | DRA |
25 | |:------------------|:---------------------------------|:------------|:-------------------------------|
26 | | v0.1.0-beta | Kubernetes v1.26 branch v1.26.x | unsupported | Classic |
27 | | v0.1.1-beta | Kubernetes v1.27 branch v1.27.x | unsupported | Classic |
28 | | v0.2.0 | Kubernetes v1.28 branch v1.28.x | unsupported | Classic |
29 | | v0.3.0 | Kubernetes v1.28+ | unsupported | Classic |
30 | | v0.4.0 | Kubernetes v1.28+ | unsupported | Classic |
31 | | v0.5.0 | Kubernetes v1.27 - v1.30 | supported | Classic, Structured Parameters |
32 | | v0.6.0 | Kubernetes v1.31 | unsupported | Structured Parameters |
33 | | v0.7.0 | Kubernetes v1.32+ | supported | Structured Parameters |
34 |
35 | ## Documentation
36 |
37 | - [How to setup a Kubernetes cluster with DRA enabled](../CLUSTER_SETUP.md)
38 | - [How to deploy and use Intel GPU resource driver](USAGE.md)
39 | - Optional: [How to build Intel GPU resource driver container image](BUILD.md)
--------------------------------------------------------------------------------
/doc/gpu/allocation-delayed.puml:
--------------------------------------------------------------------------------
1 | @startuml
2 | title "Delayed allocation"
3 |
4 | actor Actor
5 | participant ResourceClaim
6 | participant Pod
7 | participant Controller
8 | participant Plugin
9 |
10 | Actor -> ResourceClaim : deploy
11 | ResourceClaim -> Controller : notify
12 | note right of Controller
13 | the difference is here
14 | end note
15 | Controller -> Controller : wait for first user
16 | Actor -> Pod : deploy
17 | Pod -> Controller : find suitable nodes
18 | Pod -> Controller : Allocate on Node N
19 | Plugin -> ResourceClaim : prepare resource and mark Ready
20 |
21 | @enduml
22 |
23 |
--------------------------------------------------------------------------------
/doc/gpu/allocation-immediate.puml:
--------------------------------------------------------------------------------
1 | @startuml
2 | title "Immediate allocation"
3 |
4 | actor Actor
5 | participant ResourceClaim
6 | participant Pod
7 | participant Controller
8 | participant Plugin
9 |
10 | Actor -> ResourceClaim : deploy
11 | ResourceClaim -> Controller : notify
12 | note right of Controller
13 | the difference is here
14 | end note
15 | Controller -> Controller : find suitable nodes
16 | Controller -> Controller : Allocate on Node N
17 | Actor -> Pod : deploy
18 | Plugin -> ResourceClaim : prepare resource and mark Ready
19 |
20 | @enduml
21 |
22 |
--------------------------------------------------------------------------------
/doc/gpu/complete-overview.puml:
--------------------------------------------------------------------------------
1 | @startuml
2 |
3 | left to right direction
4 | allowmixing
5 |
6 |
7 | component "CRD resource-classes" {
8 | component "resource-class0" {
9 | component "CRD resource-class0-parameters"
10 | }
11 | component "resource-class1" {
12 | component "CRD resource-class1-parameters"
13 | }
14 | }
15 |
16 | component "CRD nodeallocationstats" as crdnas {
17 | cloud "node0" as nasnode0 {
18 | component "allocatable GPUs" as allocatable
19 | component "claim-requests" as requests
20 | component "claim-allocations" as allocations
21 | }
22 | }
23 |
24 | node "control-plane" as cp {
25 | component "Scheduler / DRA-controller" as scheduler
26 | component "R-D controller" as rdcontroller
27 | component "API" as api
28 | }
29 |
30 | node "node0" as wn {
31 | component "Pod" as pod
32 | component "R-D kubelet-plugin" as rdplugin
33 | }
34 |
35 | component "resourceclaim0\n\nresource-class0\nparametersRef:" as resclaim0 {
36 | component resclaimparams0 [
37 | type: gpu,
38 | memory: 256,
39 | millicores: 100
40 | ]
41 | }
42 |
43 | package "Pod.yaml" as podyaml {
44 | }
45 |
46 | package "ResourceClaim.yaml" as resclaimyaml {
47 | }
48 |
49 | podyaml ..> api : deploy
50 | resclaimyaml ..> api : deploy
51 |
52 | cloud "Schedule Pod" as schedulepod {
53 | }
54 |
55 | api ..> schedulepod
56 | schedulepod ..> scheduler
57 | rdplugin --> allocatable : 0. populate & sync with CDI/CRD
58 | api --> resclaim0 : 1. create
59 | resclaim0 --> rdcontroller : 2. notify
60 | rdcontroller --> requests : 3. create
61 | rdcontroller --> requests : 3. create
62 | crdnas --> rdplugin : 4. allocate and update
63 |
64 | scheduler <=> rdcontroller : unsuitableNodes
65 | rdcontroller --> nasnode0 : enough resources?
66 |
67 | @enduml
68 |
--------------------------------------------------------------------------------
/doc/gpu/generate-pngs.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Renders every PlantUML source (*.puml) located next to this script into a
# PNG file. A PNG is (re)generated only when it is missing or older than its
# source file.

if ! command -v plantuml > /dev/null 2>&1; then
	echo "ERR: No plantuml found in PATH, plantuml is needed to produce PNG files" >&2
	exit 1
fi

# source files are in script dir; dirname also copes with "$0" having no slash
dir=$(dirname -- "$0")

for puml in "$dir"/*.puml; do
	# when nothing matches, the glob is left unexpanded - skip that literal
	[ -e "$puml" ] || continue

	png="${puml%.puml}.png"
	# update if PNG is missing or older than the source file
	# (bash's -nt is also true when the second file does not exist)
	if test "$puml" -nt "$png"; then
		echo "$puml"
		# plantuml treats every argument as an input file and writes the PNG
		# next to the source by default, so only the source is passed
		plantuml "$puml"
	fi
done
19 |
--------------------------------------------------------------------------------
/doc/gpu/high-level-overview.puml:
--------------------------------------------------------------------------------
1 | @startuml
2 |
3 | allowmixing
4 |
5 | actor User
6 |
7 | component "resourceclaim0\n\nresourceClass: class0\nparametersRef:" as resclaim0 {
8 | component resclaimparams0 [
9 | type: gpu,
10 | memory: 256,
11 | millicores: 100,
12 | count: 1,
13 | ]
14 | }
15 |
16 | component "resource-classes" {
17 | component "class0" {
18 | component "class0-parameters"
19 | }
20 | component "class1" {
21 | component "class1-parameters"
22 | }
23 | }
24 |
25 | left to right direction
26 |
27 | User --> resclaim0 : deploy
28 |
29 | @enduml
30 |
31 |
--------------------------------------------------------------------------------
/doc/qat/BUILD.md:
--------------------------------------------------------------------------------
1 | # How to build Intel® QAT Resource Driver container image
2 |
3 | ## Platforms supported
4 |
5 | - Linux
6 |
7 | ## Prerequisites
8 |
9 | - Docker or Podman.
10 |
11 | ## Building
12 |
13 | The `Makefile` automates this; the only required tool is Docker or Podman.
14 | To build the container image locally, from the root of this Git repository:
15 | ```bash
16 | make qat-container-build
17 | ```
18 |
19 | It is possible to specify custom registry, container image name, and version (tag) as separate
20 | variables to override any part of release container image URL in the build command, e.g.:
21 | ```bash
22 | REGISTRY=myregistry QAT_IMAGE_NAME=myimage QAT_IMAGE_VERSION=myversion make qat-container-build
23 | ```
24 |
25 | or whole resulting image URL (this will ignore REGISTRY, QAT_IMAGE_NAME, QAT_IMAGE_VERSION even if specified):
26 | ```bash
27 | QAT_IMAGE_TAG=myregistry/myimagename:myversion make qat-container-build
28 | ```
29 |
30 | To build the container image and push image to the destination registry straight away:
31 | ```bash
32 | REGISTRY=registry.local make qat-container-push
33 | ```
34 | or
35 | ```bash
36 | QAT_IMAGE_TAG=registry.local/intel-qat-resource-driver:latest make qat-container-push
37 | ```
38 |
--------------------------------------------------------------------------------
/doc/qat/README.md:
--------------------------------------------------------------------------------
1 | # Intel® QAT resource driver for Kubernetes
2 |
3 | CAUTION: This is beta / non-production software, do not use on production clusters.
4 |
5 | ## About resource driver
6 |
7 | With structured parameters (K8s v1.31+), the DRA driver publishes ResourceSlice, scheduler allocates
8 | the resources and resource driver's kubelet-plugin ensures that the allocated devices are prepared
9 | and available for Pods.
10 |
11 | DRA API graduated to v1beta1 in K8s v1.32. Latest DRA drivers support only K8s v1.32+.
12 |
13 | ## Host OS requirements
14 |
15 | In order to guarantee proper operation, ensure Linux kernel module `vfio_pci` has been loaded.
16 |
17 | The QAT Kubernetes resource driver is intended to be used on upstream Linux kernels,
18 | see [the in-tree kernel documentation](https://intel.github.io/quickassist/RN/In-Tree/in_tree_firmware_RN.html)
19 | for details. Note though, that the QAT resource driver itself does not depend on
20 | any QAT user space libraries mentioned in that document.
21 |
22 | ## Supported QAT devices
23 |
24 | All 4th Gen Intel® Xeon® Scalable Processor QAT devices handled by the Linux kernel
25 | driver module `qat_4xxx` are supported.
26 |
27 | ## Supported Kubernetes Versions
28 |
29 | Supported Kubernetes versions are listed below:
30 |
31 | | Branch | Kubernetes branch/version | Status | DRA |
32 | |:------------------|:--------------------------------|:------------|:-------------------------------|
33 | | v0.1.0 | Kubernetes v1.31 | unsupported | Structured Parameters |
34 | | v0.2.0 | Kubernetes v1.32+ | supported | Structured Parameters |
35 |
36 | ## QAT service configuration
37 |
38 | In version 0.1.0 static configuration of QAT services is done using a ConfigMap,
39 | please have a look at
40 | [the example ConfigMap yaml](../../deployments/qat/examples/intel-qat-resource-driver-configuration.yaml).
41 |
42 | The ConfigMap and Resource Claims use the same string notation as the QAT kernel
43 | driver when specifying what services are to be configured for the device and Resource
44 | Claim. When two services are requested, the service strings are to be separated by
45 | semicolon (';'). Supported services are:
46 | * Symmetric cryptography: `sym`
47 | * Asymmetric cryptography: `asym`
48 | * Compression: `dc`
49 |
50 | ## Documentation
51 |
52 | - [How to setup a Kubernetes cluster with DRA enabled](../CLUSTER_SETUP.md)
53 | - [How to deploy and use Intel® QAT resource driver](USAGE.md)
54 | - Optional: [How to build Intel® QAT resource driver container image](BUILD.md)
--------------------------------------------------------------------------------
/doc/qat/TESTING.md:
--------------------------------------------------------------------------------
1 | # Test Cases
2 |
3 | ## Intel® QAT Device Plugin
4 | There are test cases made for [Intel® QAT Device Plugin](https://github.com/intel/intel-device-plugins-for-kubernetes/blob/main/cmd/qat_plugin/README.md).
5 | It is possible to run those images using this resource driver. Those images are
6 | available in the following links.
7 |
8 | - [qatlib-sample-code](https://github.com/intel/intel-device-plugins-for-kubernetes/tree/main/demo/openssl-qat-engine)
9 | - [qat-dpdk-test](https://github.com/intel/intel-device-plugins-for-kubernetes/tree/main/demo/crypto-perf)
10 |
11 | Build the images in your environment, create a resourceClaimTemplate and run
12 | the pods with the following commands.
13 | ```
14 | kubectl apply -f deployments/qat/tests/resource-claim-template.yaml
15 | kubectl apply -k deployments/qat/tests/qatlib-sample-code
16 | kubectl apply -k deployments/qat/tests/qat-dpdk-test
17 | ```
18 | All cases include both crypto and compress tests.
19 |
20 | To run `qat-dpdk-test`, the cluster should have `CPU Manager Policy` as `static`
21 | in its kubelet configuration. In addition, `hugepages-2Mi` resource should be
22 | available.
23 |
24 | There is an example [cluster setup yaml](../../deployments/qat/tests/qat-dpdk-test/modified-cluster-setup.yaml)
25 | for setting cpu manager policy as static. Re-create the cluster with the
26 | configurations enabled.
27 |
--------------------------------------------------------------------------------
/doc/qat/USAGE.md:
--------------------------------------------------------------------------------
1 | ## Requirements
2 |
3 | - Kubernetes 1.32+, with `DynamicResourceAllocation` feature-flag enabled, and
4 | [other cluster parameters](../../hack/clusterconfig.yaml)
5 | - Container runtime needs to support CDI:
6 | - CRI-O v1.23.0 or newer
7 | - Containerd v1.7 or newer
8 |
9 | ## Deploy resource-driver
10 |
11 | Deploy DeviceClass, Namespace and ResourceDriver
12 | ```bash
13 | kubectl apply -k deployments/qat/
14 | ```
15 |
16 | By default, the kubelet-plugin is deployed on _all_ nodes in the cluster, as no nodeSelector is defined.
17 | To restrict the deployment to QAT-enabled nodes, follow these steps:
18 |
19 | 1. Install Node Feature Discovery (NFD):
20 |
21 | Follow [Node Feature Discovery](https://github.com/kubernetes-sigs/node-feature-discovery) documentation to install and configure NFD in your cluster.
22 |
23 | ```bash
24 | kubectl apply -k "https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v0.17.1"
25 | ```
26 |
27 | 2. Apply NFD Rules:
28 |
29 | ```bash
30 | kubectl apply -k deployments/qat/overlays/nfd_labeled_nodes/
31 | ```
32 | After NFD is installed and running, make sure the target node is labeled with:
33 | ```bash
34 | intel.feature.node.kubernetes.io/qat: "true"
35 | ```
36 |
37 | When deploying custom-built resource driver image, change `image:` lines in
38 | [resource-driver](../../deployments/qat/base/resource-driver.yaml) to match its location.
39 |
40 |
41 | ## `deployments/` directory contains all required YAMLs:
42 |
43 | * `deployments/qat/base/device-class.yaml` - pre-defined DeviceClass that ResourceClaims can refer to.
44 | * `deployments/qat/base/namespace.yaml` - Kubernetes namespace for QAT resource driver.
45 | * `deployments/qat/base/resource-driver.yaml` - actual resource driver with service account and RBAC policy
46 | - kubelet-plugin DaemonSet - node-agent which performs three functions:
47 | 1) discovery of supported hardware on the Kubernetes cluster node and its announcement as a ResourceSlice.
48 | 2) preparation of the hardware allocated to the ResourceClaims for the Pod that is being started on the node.
49 | 3) unpreparation of the hardware allocated to the ResourceClaims for the Pod that has stopped and reached final state on the node.
50 |
51 | ### Example use case: Pod with QAT accelerator
52 |
53 | The simplest way to use the Intel® QAT resource driver is to create a ResourceClaim
54 | and add it to the Pod spec. The Intel® QAT resource driver will take care of allocating
55 | a suitable device to the Resource Claim when Kubernetes schedules the Pod on the node.
56 |
57 | Example:
58 | ```
59 | apiVersion: resource.k8s.io/v1beta1
60 | kind: ResourceClaimTemplate
61 | metadata:
62 | name: qat-template-sym
63 | spec:
64 | spec:
65 | devices:
66 | requests:
67 | - name: qat-request-sym
68 | deviceClassName: qat.intel.com
69 | selectors:
70 | - cel:
71 | expression: |-
72 | device.attributes["qat.intel.com"].services == "sym" ||
73 | device.attributes["qat.intel.com"].services == "sym;asym" ||
74 | device.attributes["qat.intel.com"].services == "sym;dc" ||
75 | device.attributes["qat.intel.com"].services == "asym;sym" ||
76 | device.attributes["qat.intel.com"].services == "dc;sym"
77 |
78 | ---
79 | apiVersion: apps/v1
80 | kind: Deployment
81 | metadata:
82 | name: qat-sample-sym
83 | labels:
84 | app: inline-qat-deployment
85 | spec:
86 | replicas: 1
87 | selector:
88 | matchLabels:
89 | app: inline-qat-deployment
90 | template:
91 | metadata:
92 | labels:
93 | app: inline-qat-deployment
94 | spec:
95 | containers:
96 | - name: with-resource
97 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2
98 | command: ["sh", "-c", "ls -la /dev/vfio/ && sleep 300"]
99 | securityContext:
100 | capabilities:
101 | add:
102 | ["IPC_LOCK"]
103 | resources:
104 | claims:
105 | - name: resource-sym
106 | resourceClaims:
107 | - name: resource-sym
108 | resourceClaimTemplateName: qat-template-sym
109 | ```
110 | QAT services are matched by a CEL expression; the example above accepts any device
111 | whose configured services include `sym`. Examples of other common service
112 | matches include `sym;asym`, `[^a]?sym` and `dc`, see [README](README.md#qat-service-configuration).
113 |
114 | `IPC_LOCK` capability is required since VFIO based device access expects IPC_LOCK with the QAT sw stack.
115 |
--------------------------------------------------------------------------------
/gaudi.mk:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | GAUDI_VERSION ?= v0.4.1
17 | GAUDI_IMAGE_NAME ?= intel-gaudi-resource-driver
18 | GAUDI_IMAGE_VERSION ?= $(GAUDI_VERSION)
19 | GAUDI_IMAGE_TAG ?= $(REGISTRY)/$(GAUDI_IMAGE_NAME):$(GAUDI_IMAGE_VERSION)
20 |
21 | GAUDI_BINARIES = \
22 | bin/kubelet-gaudi-plugin
23 |
24 | GAUDI_COMMON_SRC = \
25 | $(COMMON_SRC) \
26 | pkg/gaudi/cdihelpers/*.go \
27 | pkg/gaudi/device/*.go \
28 | pkg/gaudi/discovery/*.go
29 |
30 | # Gaudi DRA driver is not statically built, it depends on libhlml.so, therefore
31 | # the -extldflags ${EXT_LDFLAGS} is not used.
32 | GAUDI_LDFLAGS = ${LDFLAGS} -X ${PKG}/pkg/version.driverVersion=${GAUDI_VERSION}
33 |
34 | .PHONY: gaudi
35 | gaudi: $(GAUDI_BINARIES)
36 |
37 | bin/kubelet-gaudi-plugin: cmd/kubelet-gaudi-plugin/*.go $(GAUDI_COMMON_SRC)
38 | GOOS=linux GOARCH=${ARCH} \
39 | go build -a -ldflags "${GAUDI_LDFLAGS}" -mod vendor -o $@ ./cmd/kubelet-gaudi-plugin
40 |
41 | .PHONY: gaudi-container-build
42 | gaudi-container-build: cleanall vendor
43 | @echo "Building Gaudi resource driver container..."
44 | $(DOCKER) build --pull --platform="linux/$(ARCH)" -t $(GAUDI_IMAGE_TAG) \
45 | --build-arg LOCAL_LICENSES=$(LOCAL_LICENSES) \
46 | --build-arg HTTP_PROXY=$(http_proxy) \
47 | --build-arg HTTPS_PROXY=$(https_proxy) \
48 | --build-arg NO_PROXY=$(no_proxy) \
49 | -f Dockerfile.gaudi .
50 |
51 | .PHONY: gaudi-container-push
52 | gaudi-container-push: gaudi-container-build
53 | $(DOCKER) push $(GAUDI_IMAGE_TAG)
54 |
--------------------------------------------------------------------------------
/gpu.mk:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Use a custom version for E2E tests if we are testing in CI
16 | GPU_VERSION ?= v0.7.0
17 | GPU_IMAGE_NAME ?= intel-gpu-resource-driver
18 | GPU_IMAGE_VERSION ?= $(GPU_VERSION)
19 | GPU_IMAGE_TAG ?= $(REGISTRY)/$(GPU_IMAGE_NAME):$(GPU_IMAGE_VERSION)
20 |
21 | GPU_BINARIES = \
22 | bin/kubelet-gpu-plugin
23 |
24 | GPU_COMMON_SRC = \
25 | $(COMMON_SRC) \
26 | pkg/gpu/cdihelpers/*.go \
27 | pkg/gpu/device/*.go \
28 | pkg/gpu/discovery/*.go
29 |
30 | GPU_LDFLAGS = ${LDFLAGS} -extldflags $(EXT_LDFLAGS) -X ${PKG}/pkg/version.driverVersion=${GPU_VERSION}
31 |
32 | .PHONY: gpu
33 | gpu: $(GPU_BINARIES)
34 |
35 | bin/kubelet-gpu-plugin: cmd/kubelet-gpu-plugin/*.go $(GPU_COMMON_SRC)
36 | CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH} \
37 | go build -a -ldflags "${GPU_LDFLAGS}" -mod vendor -o $@ ./cmd/kubelet-gpu-plugin
38 |
39 | bin/alert-webhook: cmd/alert-webhook/*.go $(GPU_COMMON_SRC)
40 | CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH} \
41 | go build -a -ldflags "${GPU_LDFLAGS}" -mod vendor -o $@ ./cmd/alert-webhook
42 |
43 | .PHONY: gpu-container-build
44 | gpu-container-build: cleanall vendor
45 | @echo "Building GPU resource drivers container..."
46 | $(DOCKER) build --pull --platform="linux/$(ARCH)" -t $(GPU_IMAGE_TAG) \
47 | --build-arg LOCAL_LICENSES=$(LOCAL_LICENSES) -f Dockerfile.gpu .
48 |
49 | .PHONY: gpu-container-push
50 | gpu-container-push: gpu-container-build
51 | $(DOCKER) push $(GPU_IMAGE_TAG)
52 |
--------------------------------------------------------------------------------
/hack/boilerplate.go.txt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
--------------------------------------------------------------------------------
/hack/clusterconfig.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeadm.k8s.io/v1beta3
2 | kind: ClusterConfiguration
3 | apiServer:
4 | extraArgs:
5 | feature-gates: "DynamicResourceAllocation=true"
6 | runtime-config: "resource.k8s.io/v1beta1=true"
7 | controllerManager:
8 | extraArgs:
9 | feature-gates: "DynamicResourceAllocation=true"
10 | scheduler:
11 | extraArgs:
12 | "feature-gates": "DynamicResourceAllocation=true"
13 | ---
14 | apiVersion: kubelet.config.k8s.io/v1beta1
15 | kind: KubeletConfiguration
16 | featureGates:
17 | DynamicResourceAllocation: true
18 | ---
19 | apiVersion: kubeadm.k8s.io/v1beta3
20 | kind: InitConfiguration
21 | nodeRegistration:
22 | criSocket: "unix:///var/run/crio/crio.sock"
23 | #criSocket: "unix:///var/run/containerd/containerd.sock"
24 | ---
25 | apiVersion: kubeproxy.config.k8s.io/v1alpha1
26 | kind: KubeProxyConfiguration
27 | featureGates:
28 | DynamicResourceAllocation: true
29 |
--------------------------------------------------------------------------------
/hack/fake_libhlml/Makefile:
--------------------------------------------------------------------------------
1 | CC = gcc
2 | CFLAGS = -O -Wall -Wextra -Wno-unused-parameter -fPIC
3 | LDFLAGS = -shared
4 | TARGET = fake_libhlml.so
5 | SRCS = $(wildcard *.c)
6 | OBJS = $(SRCS:.c=.o)
7 |
8 | all: $(TARGET)
9 |
10 | $(TARGET): $(OBJS)
11 | $(CC) $(LDFLAGS) -o $@ $^
12 |
13 | %.o: %.c
14 | $(CC) $(CFLAGS) -c $< -o $@
15 |
16 | clean:
17 | rm -f $(OBJS) $(TARGET)
18 |
19 | .PHONY: all clean
20 |
--------------------------------------------------------------------------------
/hack/fake_libhlml/README.md:
--------------------------------------------------------------------------------
1 | This implements a stub / mock for the interface defined in
2 | https://github.com/HabanaAI/gohlml/blob/main/hlml.h.
3 |
4 | The result is a shared library fake_libhlml.so - it can be used to simulate presence of Gaudi
5 | devices and kernel driver.
6 |
7 | To run tests for Gaudi health monitoring locally, follow these steps:
8 |
9 | - build hack/fake_libhlml
10 | ```
11 | cd hack/fake_libhlml
12 | make
13 | ```
14 | - deploy it where Go module expects to find it
15 | ```
16 | sudo mkdir /usr/lib/habanalabs
17 | sudo cp hack/fake_libhlml/fake_libhlml.so /usr/lib/habanalabs/libhlml.so
18 | ```
19 | - add ld config to use that library and trigger ldconfig, it will be needed for running tests
20 | with and without VSCode:
21 | ```
22 | cat << EOF | sudo tee /etc/ld.so.conf.d/habanalabs.conf
23 | /usr/lib/habanalabs/
24 | EOF
25 |
26 | sudo ldconfig
27 | ```
28 |
29 |
--------------------------------------------------------------------------------
/hack/tools.go:
--------------------------------------------------------------------------------
1 | //go:build tools
2 | // +build tools
3 |
4 | // This package imports things required by build scripts, to force `go mod` to see them as dependencies
5 | package tools
6 |
7 | import _ "k8s.io/code-generator"
8 |
--------------------------------------------------------------------------------
/pkg/fakehlml/fake_hlml.go:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package fakehlml
18 |
19 | /*
20 | #cgo LDFLAGS: "/usr/lib/habanalabs/libhlml.so" -ldl -Wl,--unresolved-symbols=ignore-all
21 | #include "fake_hlml.h"
22 | #include <stdlib.h>
23 | */
24 | import "C"
25 |
26 | import (
27 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/gaudi/device"
28 | )
29 |
30 | // KEEP THIS IDENTICAL TO fake_hlml.h call_identity_t
31 | const (
32 | FakeInit uint32 = iota
33 | FakeInitWithFlags
34 | FakeShutdown
35 | FakeDeviceGetCount
36 | FakeDeviceGetHandleByPCIBusID
37 | FakeDeviceGetHandleByIndex
38 | FakeDeviceGetHandleByUUID
39 | FakeDeviceGetName
40 | FakeDeviceGetPCIInfo
41 | FakeDeviceGetSerial
42 | FakeDeviceRegisterEvents
43 | FakeEventSetCreate
44 | FakeEventSetFree
45 | FakeEventSetWait
46 | )
47 |
48 | // KEEP THIS IDENTICAL TO hlml.h hlml_return_t
49 | const (
50 | HLMLSuccess = 0
51 | HLMLErrorUninitialized = 1
52 | HLMLErrorInvalidArgument = 2
53 | HLMLErrorNotSupported = 3
54 | HLMLErrorAlreadyInitialized = 5
55 | HLMLErrorNotFound = 6
56 | HLMLErrorInsufficientSize = 7
57 | HLMLErrorDriverNotLoaded = 9
58 | HLMLErrorTimeout = 10
59 | HLMLErrorAipIsLost = 15
60 | HLMLErrorMemory = 20
61 | HLMLErrorNoData = 21
62 | HLMLErrorUnknown = 49
63 | )
64 |
65 | func AddDevices(devicesInfo device.DevicesInfo) {
66 | for _, deviceInfo := range devicesInfo {
67 | C.add_device(
68 | C.CString(deviceInfo.PCIAddress),
69 | C.CString(deviceInfo.Model),
70 | C.CString("0x0"), // vendor
71 | C.CString(deviceInfo.Serial),
72 | C.uint(deviceInfo.DeviceIdx),
73 | )
74 | }
75 | }
76 |
77 | func Reset() {
78 | C.reset()
79 | }
80 |
81 | func SetReturnCode(callId uint32, returnCode uint32) {
82 | C.set_error(C.call_identity_t(callId), C.hlml_return_t(returnCode))
83 | }
84 |
85 | func AddCriticalEvent(serial string) {
86 | C.add_critical_event(C.CString(serial))
87 | }
88 |
89 | func ResetRvents() {
90 | C.reset_events()
91 | }
92 |
--------------------------------------------------------------------------------
/pkg/fakehlml/fake_hlml.h:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: MIT
2 | *
3 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved.
4 | *
5 | */
6 |
7 | #ifndef __FAKE_HLML_H__
8 | #define __FAKE_HLML_H__
9 |
10 | #ifdef __cplusplus
11 | extern "C" {
12 | #endif
13 |
14 | #include "../../vendor/github.com/HabanaAI/gohlml/hlml.h"
15 |
16 | /* Enum for returned values of the different APIs */
17 | typedef enum call_identity {
18 | FAKE_INIT = 0,
19 | FAKE_INIT_WITH_FLAGS,
20 | FAKE_SHUTDOWN,
21 | FAKE_DEVICE_GET_COUNT,
22 | FAKE_DEVICE_GET_HANDLE_BY_PCI_BUS_ID,
23 | FAKE_DEVICE_GET_HANDLE_BY_INDEX,
24 | FAKE_DEVICE_GET_HANDLE_BY_UUID,
25 | FAKE_DEVICE_GET_NAME,
26 | FAKE_DEVICE_GET_PCI_INFO,
27 | FAKE_DEVICE_GET_SERIAL,
28 | FAKE_DEVICE_REGISTER_EVENTS,
29 | FAKE_EVENT_SET_CREATE,
30 | FAKE_EVENT_SET_FREE,
31 | FAKE_EVENT_SET_WAIT,
32 | FAKE_CALL_IDENTITY_MAX
33 | } call_identity_t;
34 |
35 | void add_device(const char *pci_addr, const char *pci_device_id, const char *pci_vendor_id, const char *serial, unsigned int index);
36 | void reset(void);
37 |
38 | void set_error(call_identity_t call_id, hlml_return_t errCode);
39 | void set_success(call_identity_t call_id);
40 |
41 | void add_critical_event(const char *serial);
42 | void reset_events(void);
43 |
44 | #ifdef __cplusplus
45 | } //extern "C"
46 | #endif
47 |
48 | #endif /* __FAKE_HLML_H__ */
49 |
--------------------------------------------------------------------------------
/pkg/fakesysfs/fakesysfs.go:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package fakesysfs
18 |
19 | import (
20 | "fmt"
21 | "os"
22 | "path"
23 | "strconv"
24 | "strings"
25 |
26 | "golang.org/x/sys/unix"
27 | )
28 |
29 | const (
30 | devNullMajor = 1
31 | devNullMinor = 3
32 | devNullType = unix.S_IFCHR
33 | )
34 |
// newPCIAddress finds the next available free PCI address in the given
// directory, starting the search at currentAddress (inclusive).
//
// currentAddress has to start with "DDDD:BB:dd" (domain, bus, device) written
// in hexadecimal, the same notation this function produces. An address is
// considered free when "<address>0" (function 0) cannot be stat'ed in driverDir.
//
// Returns partial PCI address without function, "0000:00:00.", used in loop
// when fake VFs are generated. An error is returned when currentAddress cannot
// be parsed or when every address up to ffff:ff:ff is taken.
func newPCIAddress(driverDir string, currentAddress string) (string, error) {
	// Domain, bus and device are sliced out by position, so anything shorter
	// than "0000:00:00" cannot be parsed (and would panic when sliced).
	const minAddressLen = len("0000:00:00")
	if len(currentAddress) < minAddressLen {
		return "", fmt.Errorf("could not parse current PCI address %v", currentAddress)
	}

	// The address is formatted with %x below, so it has to be parsed as base 16
	// as well; base 10 rejects e.g. "0a" and misreads "10".
	domain, err1 := strconv.ParseUint(currentAddress[:4], 16, 64)
	bus, err2 := strconv.ParseUint(currentAddress[5:7], 16, 64)
	device, err3 := strconv.ParseUint(currentAddress[8:10], 16, 64)

	if err1 != nil || err2 != nil || err3 != nil {
		return "", fmt.Errorf("could not parse current PCI address %v", currentAddress)
	}

	for ; domain <= 0xffff; domain++ {
		for ; bus <= 0xff; bus++ {
			for ; device <= 0xff; device++ {
				// partial PCI address without function
				newAddress := fmt.Sprintf("%04x:%02x:%02x.", domain, bus, device)
				// add zero for PCI function part of the address
				newSysfsDeviceDir := path.Join(driverDir, newAddress+"0")
				if _, err := os.Stat(newSysfsDeviceDir); err != nil {
					return newAddress, nil
				}
			}
			// Only the first bus starts at the parsed device number; without
			// this reset every following bus would be skipped.
			device = 0
		}
		// Same for the buses of every following domain.
		bus = 0
	}

	return "", fmt.Errorf("no addresses left")
}
63 |
// sanitizeFakeSysFsDir ensures that the fake sysfs root is located in /tmp.
//
// The path is cleaned first, so "/tmp/../etc" is rejected. Only "/tmp" itself
// or paths below "/tmp/" are accepted; siblings that merely share the prefix,
// such as "/tmpfoo", are rejected. Returns nil when the location is acceptable
// and an error otherwise.
func sanitizeFakeSysFsDir(sysfsRootUntrusted string) error {
	// fake sysfsroot should be deletable.
	// To prevent disaster mistakes, it is enforced to be in /tmp.
	sysfsRoot := path.Clean(sysfsRootUntrusted)
	if sysfsRoot != "/tmp" && !strings.HasPrefix(sysfsRoot, "/tmp/") {
		return fmt.Errorf("fake sysfsroot can only be in /tmp, got: %v", sysfsRoot)
	}

	return nil
}
75 |
76 | func createDevice(filepath string) error {
77 | mode := uint32(0644 | devNullType)
78 | devid := int(unix.Mkdev(uint32(devNullMajor), uint32(devNullMinor)))
79 |
80 | if err := unix.Mknod(filepath, mode, devid); err != nil {
81 | return fmt.Errorf("NULL device (%d:%d) node creation failed for '%s': %w",
82 | devNullMajor, devNullMinor, filepath, err)
83 | }
84 |
85 | return nil
86 | }
87 |
--------------------------------------------------------------------------------
/pkg/gaudi/device/device.go:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package device
18 |
19 | import (
20 | "fmt"
21 | "path/filepath"
22 | "regexp"
23 |
24 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/helpers"
25 | )
26 |
var (
	// PciRegexp matches a PCI address in DBDF notation at the end of a string.
	PciRegexp = regexp.MustCompile(`[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-7]$`)
	// AccelRegexp matches accel device names, e.g. accel0.
	AccelRegexp = regexp.MustCompile(`^accel[0-9]+$`)
	// AccelControlRegexp matches accel control device names, e.g. accel_controlD0.
	AccelControlRegexp = regexp.MustCompile(`^accel_controlD[0-9]+$`)
	// ModelNames maps a PCI device ID to the model name of the device.
	ModelNames = map[string]string{
		"0x1000": "Gaudi",
		"0x1010": "Gaudi",
		"0x1001": "Gaudi",
		"0x1011": "Gaudi",
		"0x1020": "Gaudi2",
		"0x1030": "Gaudi3",
		"0x1060": "Gaudi3",
		"0x1061": "Gaudi3",
		"0x1062": "Gaudi3",
	}
)

const (
	DevfsAccelPath = "accel"

	// driver.sysfsDriverDir and driver.sysfsAccelDir are sysfsDriverPath and sysfsAccelPath
	// respectively prefixed with $SYSFS_ROOT.
	SysfsDriverPath = "bus/pci/drivers/habanalabs"
	SysfsAccelPath  = "devices/virtual/accel/"

	CDIVendor        = "intel.com"
	CDIClass         = "gaudi"
	CDIKind          = CDIVendor + "/" + CDIClass
	DriverName       = CDIClass + "." + CDIVendor
	PCIAddressLength = len("0000:00:00.0")

	PreparedClaimsFileName  = "preparedClaims.json"
	PluginRegistrarFileName = DriverName + ".sock"
	PluginSocketFileName    = "plugin.sock"

	DefaultNamingStyle       = "machine"
	VisibleDevicesEnvVarName = "HABANA_VISIBLE_DEVICES"
)

// DeviceInfo is an internal structure type to store info about discovered device.
type DeviceInfo struct {
	// UID is a unique identifier on node, used in ResourceSlice K8s API object as RFC1123-compliant identifier.
	// Consists of PCIAddress and Model with colons and dots replaced with hyphens, e.g. 0000-01-02-0-0x12345.
	UID        string `json:"uid"`
	PCIAddress string `json:"pciaddress"` // PCI address in Linux DBDF notation for use with sysfs, e.g. 0000:00:00.0
	Model      string `json:"model"`      // PCI device ID
	ModelName  string `json:"modelname"`  // SKU name of the device, e.g. Gaudi2
	DeviceIdx  uint64 `json:"deviceidx"`  // accel device number (e.g. 0 for /dev/accel/accel0)
	ModuleIdx  uint64 `json:"moduleidx"`  // OAM slot number, needed for Habana Runtime to set networking
	PCIRoot    string `json:"pciroot"`    // PCI Root complex ID
	Serial     string `json:"serial"`     // Serial number obtained through HLML library
	Healthy    bool   `json:"healthy"`    // True if device is usable, false otherwise
}

// CDIName returns the device name in "<CDIKind>=<UID>" form,
// e.g. intel.com/gaudi=0000-01-02-0-0x12345.
func (g DeviceInfo) CDIName() string {
	return fmt.Sprintf("%s=%s", CDIKind, g.UID)
}

// DeepCopy returns a pointer to an independent copy of the device info.
// A nil receiver yields nil instead of a panic.
func (g *DeviceInfo) DeepCopy() *DeviceInfo {
	if g == nil {
		return nil
	}

	di := *g

	return &di
}

// SetModelName sets ModelName to the ModelNames entry for the device's Model,
// or to "Unknown" when the model is not listed.
func (g *DeviceInfo) SetModelName() {
	if modelName, found := ModelNames[g.Model]; found {
		g.ModelName = modelName
		return
	}
	g.ModelName = "Unknown"
}

// DevicesInfo is a dictionary with DeviceInfo.uid being the key.
type DevicesInfo map[string]*DeviceInfo

// DeepCopy returns a new map in which every device is an independent copy of
// the original one. The result is never nil, even for a nil receiver; nil
// entries stay nil.
func (g *DevicesInfo) DeepCopy() DevicesInfo {
	if g == nil {
		return DevicesInfo{}
	}

	devicesInfoCopy := make(DevicesInfo, len(*g))
	for duid, device := range *g {
		devicesInfoCopy[duid] = device.DeepCopy()
	}

	return devicesInfoCopy
}
108 | func GetAccelDevfsPath() string {
109 | return filepath.Join(helpers.GetDevRoot(helpers.DevfsEnvVarName, DevfsAccelPath), DevfsAccelPath)
110 | }
111 |
--------------------------------------------------------------------------------
/pkg/gaudi/device/device_test.go:
--------------------------------------------------------------------------------
1 | package device
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestCDIName(t *testing.T) {
8 | tests := []struct {
9 | name string
10 | device DeviceInfo
11 | expected string
12 | }{
13 | {
14 | name: "Valid device UID",
15 | device: DeviceInfo{
16 | UID: "0000-01-02-0-0x12345",
17 | },
18 | expected: "intel.com/gaudi=0000-01-02-0-0x12345",
19 | },
20 | {
21 | name: "Another valid device UID",
22 | device: DeviceInfo{
23 | UID: "0000-02-03-0-0x67890",
24 | },
25 | expected: "intel.com/gaudi=0000-02-03-0-0x67890",
26 | },
27 | }
28 |
29 | for _, tt := range tests {
30 | t.Run(tt.name, func(t *testing.T) {
31 | result := tt.device.CDIName()
32 | if result != tt.expected {
33 | t.Errorf("expected %v, got %v", tt.expected, result)
34 | }
35 | })
36 | }
37 | }
38 |
39 | func TestDevicesInfoDeepCopy(t *testing.T) {
40 | original := DevicesInfo{
41 | "0000-01-02-0-0x12345": {
42 | UID: "0000-01-02-0-0x12345",
43 | PCIAddress: "0000:01:02.0",
44 | Model: "0x1020",
45 | ModelName: "Gaudi2",
46 | DeviceIdx: 1,
47 | ModuleIdx: 2,
48 | PCIRoot: "0000:00",
49 | Serial: "1234567890",
50 | Healthy: true,
51 | },
52 | }
53 |
54 | copy := original.DeepCopy()
55 |
56 | if © == &original {
57 | t.Error("DeepCopy() returned the same pointer, expected different pointers")
58 | }
59 |
60 | for key, originalDevice := range original {
61 | copyDevice, exists := copy[key]
62 | if !exists {
63 | t.Errorf("DeepCopy() missing device with key %v", key)
64 | continue
65 | }
66 |
67 | if copyDevice == originalDevice {
68 | t.Errorf("DeepCopy() returned the same pointer for device with key %v, expected different pointers", key)
69 | }
70 |
71 | if *copyDevice != *originalDevice {
72 | t.Errorf("DeepCopy() returned different values for device with key %v, expected identical values", key)
73 | }
74 | }
75 | }
76 |
77 | func TestSetModelName(t *testing.T) {
78 | tests := []struct {
79 | name string
80 | deviceInfo DeviceInfo
81 | expected string
82 | }{
83 | {
84 | name: "Known model 0x1000",
85 | deviceInfo: DeviceInfo{
86 | Model: "0x1000",
87 | },
88 | expected: "Gaudi",
89 | },
90 | {
91 | name: "Known model 0x1020",
92 | deviceInfo: DeviceInfo{
93 | Model: "0x1020",
94 | },
95 | expected: "Gaudi2",
96 | },
97 | {
98 | name: "Unknown model",
99 | deviceInfo: DeviceInfo{
100 | Model: "0x9999",
101 | },
102 | expected: "Unknown",
103 | },
104 | }
105 |
106 | for _, tt := range tests {
107 | t.Run(tt.name, func(t *testing.T) {
108 | tt.deviceInfo.SetModelName()
109 | if tt.deviceInfo.ModelName != tt.expected {
110 | t.Errorf("expected %v, got %v", tt.expected, tt.deviceInfo.ModelName)
111 | }
112 | })
113 | }
114 | }
115 |
--------------------------------------------------------------------------------
/pkg/helpers/device.go:
--------------------------------------------------------------------------------
1 | package helpers
2 |
3 | import (
4 | "fmt"
5 | "os"
6 | "path"
7 | "strings"
8 | )
9 |
const (
	// SysfsEnvVarName is the env var GetSysfsRoot consults for a custom sysfs root.
	SysfsEnvVarName = "SYSFS_ROOT"
	// sysfsDefaultRoot is returned by GetSysfsRoot when no usable custom root is found.
	sysfsDefaultRoot = "/sys"

	// DevfsEnvVarName is an env var name that can be passed to GetDevRoot.
	DevfsEnvVarName = "DEVFS_ROOT"
	// devfsDefaultRoot is returned by GetDevRoot when no usable custom root is found.
	devfsDefaultRoot = "/dev"

	// PCIAddressLength is the length of a PCI address in DBDF notation.
	PCIAddressLength = len("0000:00:00.0")
)
19 |
20 | // GetSysfsRoot tries to get path where sysfs is mounted from
21 | // env var, or fallback to hardcoded path.
22 | func GetSysfsRoot(sysfsPath string) string {
23 | sysfsRoot, found := os.LookupEnv(SysfsEnvVarName)
24 |
25 | if found {
26 | if _, err := os.Stat(path.Join(sysfsRoot, sysfsPath)); err == nil {
27 | fmt.Printf("using custom sysfs location: %v\n", sysfsRoot)
28 | return sysfsRoot
29 | } else {
30 | fmt.Printf("could not find sysfs at '%v' from %v env var: %v\n", sysfsPath, SysfsEnvVarName, err)
31 | }
32 | }
33 |
34 | fmt.Printf("using default sysfs location: %v\n", sysfsDefaultRoot)
35 | // If /sys is not available, devices discovery will fail gracefully.
36 | return sysfsDefaultRoot
37 | }
38 |
39 | func GetDevRoot(devfsRootEnvVarName string, devPath string) string {
40 | devfsRoot, found := os.LookupEnv(devfsRootEnvVarName)
41 |
42 | if found {
43 | if _, err := os.Stat(path.Join(devfsRoot, devPath)); err == nil {
44 | fmt.Printf("using custom devfs location: %v\n", devfsRoot)
45 | return devfsRoot
46 | } else {
47 | fmt.Printf("could not find devfs at '%v' from %v env var: %v\n", devPath, devfsRootEnvVarName, err)
48 | }
49 | }
50 |
51 | fmt.Printf("using default devfs root: %v\n", devfsDefaultRoot)
52 | return devfsDefaultRoot
53 | }
54 |
55 | func PciInfoFromDeviceUID(deviceUID string) (string, string) {
56 | // 0000-00-01-0-0x0000 -> 0000:00:01.0, 0x0000
57 | rfc1123PCIaddress := deviceUID[:PCIAddressLength]
58 | pciAddress := strings.Replace(strings.Replace(rfc1123PCIaddress, "-", ":", 2), "-", ".", 1)
59 | deviceId := deviceUID[PCIAddressLength+1:]
60 |
61 | return pciAddress, deviceId
62 | }
63 |
// DeviceUIDFromPCIinfo builds a device UID from a PCI address and a PCI ID,
// e.g. 0000:00:01.0, 0x0000 -> 0000-00-01-0-0x0000.
func DeviceUIDFromPCIinfo(pciAddress string, pciid string) string {
	// Colons and dots in the PCI address both become hyphens.
	toHyphens := strings.NewReplacer(":", "-", ".", "-")
	hyphenatedAddress := toHyphens.Replace(pciAddress)

	return fmt.Sprintf("%v-%v", hyphenatedAddress, pciid)
}
72 |
--------------------------------------------------------------------------------
/pkg/helpers/device_test.go:
--------------------------------------------------------------------------------
1 | package helpers
2 |
3 | import (
4 | "os"
5 | "path"
6 | "testing"
7 | )
8 |
9 | func TestGetSysfsRoot(t *testing.T) {
10 | tests := []struct {
11 | name string
12 | envVarValue string
13 | sysfsPath string
14 | expected string
15 | setupEnv bool
16 | }{
17 | {
18 | name: "Custom sysfs location exists",
19 | envVarValue: TestSysfsRoot,
20 | sysfsPath: "devices",
21 | expected: TestSysfsRoot,
22 | setupEnv: true,
23 | },
24 | {
25 | name: "Custom sysfs location does not exist",
26 | envVarValue: "/invalid/sys",
27 | sysfsPath: "devices",
28 | expected: sysfsDefaultRoot,
29 | setupEnv: true,
30 | },
31 | {
32 | name: "Default sysfs location",
33 | envVarValue: "",
34 | sysfsPath: "devices",
35 | expected: sysfsDefaultRoot,
36 | setupEnv: false,
37 | },
38 | }
39 |
40 | for _, tt := range tests {
41 | t.Run(tt.name, func(t *testing.T) {
42 | if tt.setupEnv {
43 | os.Setenv(SysfsEnvVarName, tt.envVarValue)
44 | defer os.Unsetenv(SysfsEnvVarName)
45 | }
46 |
47 | if tt.envVarValue != "" {
48 | if err := os.MkdirAll(path.Join(tt.envVarValue, tt.sysfsPath), os.ModePerm); err != nil {
49 | t.Logf("failed to create directory: %v", err)
50 | }
51 | defer os.RemoveAll(tt.envVarValue)
52 | }
53 |
54 | result := GetSysfsRoot(tt.sysfsPath)
55 | if result != tt.expected {
56 | t.Errorf("expected %v, got %v", tt.expected, result)
57 | }
58 | })
59 | }
60 | }
61 |
62 | func TestGetDevRoot(t *testing.T) {
63 | tests := []struct {
64 | name string
65 | envVarName string
66 | envVarValue string
67 | devPath string
68 | expected string
69 | setupEnv bool
70 | }{
71 | {
72 | name: "Custom devfs location exists",
73 | envVarName: DevfsEnvVarName,
74 | envVarValue: TestDevfsRoot,
75 | devPath: "devices",
76 | expected: TestDevfsRoot,
77 | setupEnv: true,
78 | },
79 | {
80 | name: "Custom devfs location does not exist",
81 | envVarName: DevfsEnvVarName,
82 | envVarValue: "/invalid/dev",
83 | devPath: "devices",
84 | expected: devfsDefaultRoot,
85 | setupEnv: true,
86 | },
87 | {
88 | name: "Default devfs location",
89 | envVarName: DevfsEnvVarName,
90 | envVarValue: "",
91 | devPath: "devices",
92 | expected: devfsDefaultRoot,
93 | setupEnv: false,
94 | },
95 | }
96 |
97 | for _, tt := range tests {
98 | t.Run(tt.name, func(t *testing.T) {
99 | if tt.setupEnv {
100 | os.Setenv(tt.envVarName, tt.envVarValue)
101 | defer os.Unsetenv(tt.envVarName)
102 | }
103 |
104 | if tt.envVarValue != "" {
105 | if err := os.MkdirAll(path.Join(tt.envVarValue, tt.devPath), os.ModePerm); err != nil {
106 | t.Logf("failed to create directory: %v", err)
107 | }
108 | defer os.RemoveAll(tt.envVarValue)
109 | }
110 |
111 | result := GetDevRoot(tt.envVarName, tt.devPath)
112 | if result != tt.expected {
113 | t.Errorf("expected %v, got %v", tt.expected, result)
114 | }
115 | })
116 | }
117 | }
118 |
119 | func TestPciInfoFromDeviceUID(t *testing.T) {
120 | tests := []struct {
121 | name string
122 | deviceUID string
123 | expectedPCIAddress string
124 | expectedPCIID string
125 | }{
126 | {
127 | name: "Valid device UID",
128 | deviceUID: "1234-56-78-9-0x1234",
129 | expectedPCIAddress: "1234:56:78.9",
130 | expectedPCIID: "0x1234",
131 | },
132 | }
133 |
134 | for _, tt := range tests {
135 | t.Run(tt.name, func(t *testing.T) {
136 | pciAddress, pciID := PciInfoFromDeviceUID(tt.deviceUID)
137 | if pciAddress != tt.expectedPCIAddress || pciID != tt.expectedPCIID {
138 | t.Errorf("expected PCI address %v and PCI ID %v, got PCI address %v and PCI ID %v", tt.expectedPCIAddress, tt.expectedPCIID, pciAddress, pciID)
139 | }
140 | })
141 | }
142 | }
143 |
144 | func TestDeviceUIDFromPCIinfo(t *testing.T) {
145 | tests := []struct {
146 | name string
147 | pciAddress string
148 | pciid string
149 | expected string
150 | }{
151 | {
152 | name: "Valid PCI address and ID",
153 | pciAddress: "0000:00:01.0",
154 | pciid: "0x0000",
155 | expected: "0000-00-01-0-0x0000",
156 | },
157 | }
158 |
159 | for _, tt := range tests {
160 | t.Run(tt.name, func(t *testing.T) {
161 | result := DeviceUIDFromPCIinfo(tt.pciAddress, tt.pciid)
162 | if result != tt.expected {
163 | t.Errorf("expected %v, got %v", tt.expected, result)
164 | }
165 | })
166 | }
167 | }
168 |
--------------------------------------------------------------------------------
/pkg/helpers/driver.go:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2025, Intel Corporation. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package helpers
18 |
19 | import "context"
20 |
// Driver is the minimal interface a driver has to implement.
type Driver interface {
	// Shutdown stops the driver. It receives a context and reports failure
	// through the returned error.
	Shutdown(ctx context.Context) error
}
24 |
--------------------------------------------------------------------------------
/pkg/helpers/helpers_test.go:
--------------------------------------------------------------------------------
1 | package helpers
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/urfave/cli/v2"
7 |
8 | "context"
9 | "flag"
10 | "os"
11 | "testing"
12 | )
13 |
14 | func TestNewAppWithFlags(t *testing.T) {
15 | driverName := "test-driver"
16 | newDriver := func(ctx context.Context, config *Config) (Driver, error) {
17 | return nil, nil
18 | }
19 |
20 | app := NewApp(driverName, newDriver, []cli.Flag{}, (interface{})(nil))
21 | set := flag.NewFlagSet("test", 0)
22 | set.String("node-name", "test-node", "doc")
23 | set.String("cdi-root", "/test/cdi", "doc")
24 | set.Int("num-devices", 10, "doc")
25 |
26 | ctx := cli.NewContext(app, set, nil)
27 |
28 | err := app.Before(ctx)
29 | if err != nil {
30 | t.Fatalf("Before function failed: %v", err)
31 | }
32 |
33 | if ctx.String("node-name") != "test-node" {
34 | t.Errorf("Expected node-name to be 'test-node', got %v", ctx.String("node-name"))
35 | }
36 |
37 | if ctx.String("cdi-root") != "/test/cdi" {
38 | t.Errorf("Expected cdi-root to be '/test/cdi', got %v", ctx.String("cdi-root"))
39 | }
40 |
41 | if ctx.Int("num-devices") != 10 {
42 | t.Errorf("Expected num-devices to be 10, got %v", ctx.Int("num-devices"))
43 | }
44 | }
45 |
46 | func TestWriteFile(t *testing.T) {
47 | tests := []struct {
48 | name string
49 | filePath string
50 | fileContents string
51 | expectError bool
52 | }{
53 | {
54 | name: "Valid file path and contents",
55 | filePath: "testfile.txt",
56 | fileContents: "Hello, World!",
57 | expectError: false,
58 | },
59 | {
60 | name: "Invalid file path",
61 | filePath: "/invalidpath/testfile.txt",
62 | fileContents: "Hello, World!",
63 | expectError: true,
64 | },
65 | }
66 |
67 | for _, tt := range tests {
68 | t.Run(tt.name, func(t *testing.T) {
69 | err := WriteFile(tt.filePath, tt.fileContents)
70 | if (err != nil) != tt.expectError {
71 | t.Errorf("WriteFile() error = %v, expectError %v", err, tt.expectError)
72 | }
73 |
74 | if !tt.expectError {
75 | content, err := os.ReadFile(tt.filePath)
76 | if err != nil {
77 | t.Fatalf("Failed to read file: %v", err)
78 | }
79 | if string(content) != tt.fileContents {
80 | t.Errorf("Expected file contents to be %v, got %v", tt.fileContents, string(content))
81 | }
82 | os.Remove(tt.filePath)
83 | }
84 | })
85 | }
86 | }
87 |
88 | func TestStartPlugin(t *testing.T) {
89 | tests := []struct {
90 | name string
91 | config *Config
92 | newDriver func(ctx context.Context, config *Config) (Driver, error)
93 | setup func()
94 | expectError bool
95 | }{
96 | {
97 | name: "CDI root is not a directory",
98 | config: &Config{
99 | CommonFlags: &Flags{
100 | KubeletPluginDir: "/tmp/testplugin",
101 | CdiRoot: "/tmp/testfile",
102 | },
103 | },
104 | setup: func() {
105 | if err := os.WriteFile("/tmp/testfile", []byte("not a directory"), 0644); err != nil {
106 | t.Fatalf("Failed to write file: %v", err)
107 | }
108 | },
109 | expectError: true,
110 | },
111 | {
112 | name: "KubeletPluginDir does not exist",
113 | config: &Config{
114 | CommonFlags: &Flags{
115 | KubeletPluginDir: "/does-not-exist",
116 | },
117 | },
118 | expectError: true,
119 | },
120 | {
121 | name: "CDIRoot does not exist",
122 | config: &Config{
123 | CommonFlags: &Flags{
124 | KubeletPluginDir: AddRandomString("/tmp/test"),
125 | CdiRoot: "/does-not-exist",
126 | },
127 | },
128 | expectError: true,
129 | },
130 | {
131 | name: "NewDriver returns error",
132 | config: &Config{
133 | CommonFlags: &Flags{
134 | KubeletPluginDir: "/tmp/testplugin",
135 | CdiRoot: "/tmp/testcdi",
136 | },
137 | },
138 | newDriver: func(ctx context.Context, config *Config) (Driver, error) {
139 | return nil, fmt.Errorf("fake error %v", "from newDriver")
140 | },
141 | expectError: true,
142 | },
143 | }
144 |
145 | for _, tt := range tests {
146 | t.Run(tt.name, func(t *testing.T) {
147 | if tt.setup != nil {
148 | tt.setup()
149 | }
150 | defer os.RemoveAll("/tmp/testplugin")
151 | defer os.RemoveAll("/tmp/testcdi")
152 | defer os.Remove("/tmp/testfile")
153 |
154 | ctx := context.Background()
155 | err := StartPlugin(ctx, tt.config, tt.newDriver)
156 | if (err != nil) != tt.expectError {
157 | t.Errorf("StartPlugin() error = %v, expectError %v", err, tt.expectError)
158 | }
159 | })
160 | }
161 | }
162 |
--------------------------------------------------------------------------------
/pkg/helpers/node_state.go:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2025, Intel Corporation. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package helpers
18 |
19 | import (
20 | "context"
21 | "encoding/json"
22 | "fmt"
23 | "os"
24 | "sync"
25 |
26 | "k8s.io/klog/v2"
27 | drav1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
28 | cdiapi "tags.cncf.io/container-device-interface/pkg/cdi"
29 | )
30 |
31 | type ClaimPreparations map[string][]*drav1.Device
32 |
33 | type NodeState struct {
34 | sync.Mutex
35 | CdiCache *cdiapi.Cache
36 | Allocatable interface{}
37 | Prepared ClaimPreparations
38 | PreparedClaimsFilePath string
39 | NodeName string
40 | SysfsRoot string
41 | }
42 |
43 | func (s *NodeState) Unprepare(ctx context.Context, claimUID string) error {
44 | s.Lock()
45 | defer s.Unlock()
46 |
47 | if s.Prepared[claimUID] == nil {
48 | return nil
49 | }
50 |
51 | klog.V(5).Infof("Freeing devices from claim %v", claimUID)
52 | delete(s.Prepared, claimUID)
53 |
54 | // write prepared claims to file
55 | if err := WritePreparedClaimsToFile(s.PreparedClaimsFilePath, s.Prepared); err != nil {
56 | return fmt.Errorf("failed to write prepared claims to file: %v", err)
57 | }
58 |
59 | return nil
60 | }
61 |
62 | // GetOrCreatePreparedClaims reads a PreparedClaim from a file and deserializes it or creates the file.
63 | func GetOrCreatePreparedClaims(preparedClaimFilePath string) (ClaimPreparations, error) {
64 | if _, err := os.Stat(preparedClaimFilePath); os.IsNotExist(err) {
65 | klog.V(5).Infof("could not find file %v. Creating file", preparedClaimFilePath)
66 | f, err := os.OpenFile(preparedClaimFilePath, os.O_CREATE|os.O_WRONLY, 0600)
67 | if err != nil {
68 | return nil, fmt.Errorf("failed creating file %v. Err: %v", preparedClaimFilePath, err)
69 | }
70 | defer f.Close()
71 |
72 | if _, err := f.WriteString("{}"); err != nil {
73 | return nil, fmt.Errorf("failed writing to file %v. Err: %v", preparedClaimFilePath, err)
74 | }
75 |
76 | klog.V(5).Infof("empty prepared claims file created %v", preparedClaimFilePath)
77 |
78 | return make(ClaimPreparations), nil
79 | }
80 |
81 | return ReadPreparedClaimsFromFile(preparedClaimFilePath)
82 | }
83 |
84 | // ReadPreparedClaimToFile returns unmarshaled content for given prepared claims JSON file.
85 | func ReadPreparedClaimsFromFile(preparedClaimFilePath string) (ClaimPreparations, error) {
86 |
87 | preparedClaims := make(ClaimPreparations)
88 |
89 | preparedClaimsBytes, err := os.ReadFile(preparedClaimFilePath)
90 | if err != nil {
91 | klog.V(5).Infof("could not read prepared claims configuration from file %v. Err: %v", preparedClaimFilePath, err)
92 | return nil, fmt.Errorf("failed reading file %v. Err: %v", preparedClaimFilePath, err)
93 | }
94 |
95 | if err := json.Unmarshal(preparedClaimsBytes, &preparedClaims); err != nil {
96 | klog.V(5).Infof("Could not parse default prepared claims configuration from file %v. Err: %v", preparedClaimFilePath, err)
97 | return nil, fmt.Errorf("failed parsing file %v. Err: %v", preparedClaimFilePath, err)
98 | }
99 |
100 | return preparedClaims, nil
101 | }
102 |
103 | // WritePreparedClaimsToFile serializes PreparedClaims and writes it to a file.
104 | func WritePreparedClaimsToFile(preparedClaimFilePath string, preparedClaims ClaimPreparations) error {
105 | if preparedClaims == nil {
106 | preparedClaims = ClaimPreparations{}
107 | }
108 | encodedPreparedClaims, err := json.MarshalIndent(preparedClaims, "", " ")
109 | if err != nil {
110 | return fmt.Errorf("prepared claims JSON encoding failed. Err: %v", err)
111 | }
112 | return os.WriteFile(preparedClaimFilePath, encodedPreparedClaims, 0600)
113 | }
114 |
--------------------------------------------------------------------------------
/pkg/plugintesthelpers/plugintesthelpers.go:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package plugintesthelpers
18 |
19 | import (
20 | "fmt"
21 | "os"
22 | "path"
23 | "testing"
24 |
25 | resourcev1 "k8s.io/api/resource/v1beta1"
26 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27 | "k8s.io/apimachinery/pkg/types"
28 | )
29 |
// testRootPrefix is the name pattern NewTestDirs passes to os.MkdirTemp.
const testRootPrefix = "test-*"
33 |
// TestDirsType collects the paths of the fake directories created for a test.
type TestDirsType struct {
	// TestRoot is the directory all the other directories live under.
	TestRoot string
	// CdiRoot is the fake CDI root directory.
	CdiRoot string
	// KubeletPluginDir is the fake driver plugin directory.
	KubeletPluginDir string
	// KubeletPluginRegistryDir is the fake plugins registry directory.
	KubeletPluginRegistryDir string
	// SysfsRoot is the fake sysfs root directory.
	SysfsRoot string
	// DevfsRoot is the fake devfs directory.
	DevfsRoot string
}
42 |
43 | // NewTestDirs creates fake CDI root, sysfs, driverPlugin dirs and returns
44 | // them as a testDirsType or an error.
45 | func NewTestDirs(driverName string) (TestDirsType, error) {
46 | testRoot, err := os.MkdirTemp("", testRootPrefix)
47 | if err != nil {
48 | return TestDirsType{}, fmt.Errorf("failed creating test root dir: %v", err)
49 | }
50 |
51 | if err := os.Chmod(testRoot, 0755); err != nil {
52 | return TestDirsType{}, fmt.Errorf("failed changing permissions to test root dir: %v", err)
53 | }
54 | return NewTestDirsAt(testRoot, driverName)
55 | }
56 | func NewTestDirsAt(testRoot string, driverName string) (TestDirsType, error) {
57 | cdiRoot := path.Join(testRoot, "cdi")
58 | if err := os.MkdirAll(cdiRoot, 0755); err != nil {
59 | return TestDirsType{}, fmt.Errorf("failed creating fake CDI root dir: %v", err)
60 | }
61 |
62 | fakeSysfsRoot := path.Join(testRoot, "sysfs")
63 | if err := os.MkdirAll(fakeSysfsRoot, 0755); err != nil {
64 | return TestDirsType{}, fmt.Errorf("failed creating fake sysfs root dir: %v", err)
65 | }
66 |
67 | driverPluginRoot := path.Join(testRoot, "kubelet-plugin/plugins/", driverName)
68 | if err := os.MkdirAll(driverPluginRoot, 0755); err != nil {
69 | return TestDirsType{}, fmt.Errorf("failed creating fake driver plugin dir: %v", err)
70 | }
71 |
72 | driverRegistrarRoot := path.Join(testRoot, "kubelet-plugin/plugins_registry")
73 | if err := os.MkdirAll(driverRegistrarRoot, 0755); err != nil {
74 | return TestDirsType{}, fmt.Errorf("failed creating fake driver plugin dir: %v", err)
75 | }
76 |
77 | devfsRoot := path.Join(testRoot, "dev")
78 | if err := os.MkdirAll(devfsRoot, 0755); err != nil {
79 | return TestDirsType{}, fmt.Errorf("failed creating fake devfs dir: %v", err)
80 | }
81 |
82 | return TestDirsType{
83 | TestRoot: testRoot,
84 | CdiRoot: cdiRoot,
85 | SysfsRoot: fakeSysfsRoot,
86 | KubeletPluginDir: driverPluginRoot,
87 | KubeletPluginRegistryDir: driverRegistrarRoot,
88 | DevfsRoot: devfsRoot,
89 | }, nil
90 | }
91 |
// CleanupTest removes testRoot with everything underneath it. A failure does
// not fail the test; it is only logged together with testname.
func CleanupTest(t *testing.T, testname string, testRoot string) {
	err := os.RemoveAll(testRoot)
	if err == nil {
		return
	}

	t.Logf("%v: could not cleanup temp directory %v: %v", testname, testRoot, err)
}
97 |
98 | func NewMonitoringClaim(claimNs, claimName, claimUID, requestName, driverName, pool string, allocatedDevices []string) *resourcev1.ResourceClaim {
99 | claim := NewClaim(claimNs, claimName, claimUID, requestName, driverName, pool, allocatedDevices)
100 | claim.Spec.Devices.Requests[0].AdminAccess = &[]bool{true}[0]
101 | claim.Spec.Devices.Requests[0].AllocationMode = "All"
102 |
103 | return claim
104 | }
105 |
106 | func NewClaim(claimNs, claimName, claimUID, requestName, driverName, pool string, allocatedDevices []string) *resourcev1.ResourceClaim {
107 | allocationResults := []resourcev1.DeviceRequestAllocationResult{}
108 | for _, deviceUID := range allocatedDevices {
109 | newDevice := resourcev1.DeviceRequestAllocationResult{
110 | Device: deviceUID,
111 | Request: requestName,
112 | Driver: driverName,
113 | Pool: pool,
114 | }
115 | allocationResults = append(allocationResults, newDevice)
116 | }
117 |
118 | alienDevice := resourcev1.DeviceRequestAllocationResult{
119 | Device: "numberOne",
120 | Request: "complimentaryRequest",
121 | Driver: "NonExistent",
122 | Pool: pool,
123 | }
124 | allocationResults = append(allocationResults, alienDevice)
125 |
126 | claim := &resourcev1.ResourceClaim{
127 | TypeMeta: metav1.TypeMeta{APIVersion: "resource.k8s.io/v1beta1", Kind: "ResourceClaim"},
128 | ObjectMeta: metav1.ObjectMeta{Namespace: claimNs, Name: claimName, UID: types.UID(claimUID)},
129 | Spec: resourcev1.ResourceClaimSpec{
130 | Devices: resourcev1.DeviceClaim{
131 | Requests: []resourcev1.DeviceRequest{
132 | {Name: requestName, DeviceClassName: driverName, Count: 1},
133 | {Name: "complimentaryRequest", DeviceClassName: "NonExistent"},
134 | },
135 | },
136 | },
137 | Status: resourcev1.ResourceClaimStatus{
138 | Allocation: &resourcev1.AllocationResult{
139 | Devices: resourcev1.DeviceAllocationResult{
140 | Results: allocationResults,
141 | },
142 | },
143 | },
144 | }
145 |
146 | return claim
147 | }
148 |
--------------------------------------------------------------------------------
/pkg/qat/cdi/cdi.go:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package cdi
18 |
19 | import (
20 | "fmt"
21 | "path"
22 |
23 | "k8s.io/klog/v2"
24 | cdiapi "tags.cncf.io/container-device-interface/pkg/cdi"
25 | cdispecs "tags.cncf.io/container-device-interface/specs-go"
26 |
27 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/qat/device"
28 | )
29 |
30 | const (
31 | CDIRoot = cdiapi.DefaultDynamicDir
32 | CDIVendor = "intel.com"
33 | CDIClass = "qat"
34 | CDIKind = CDIVendor + "/" + CDIClass
35 | )
36 |
37 | type CDI struct {
38 | cache *cdiapi.Cache
39 | }
40 |
41 | func New(cdidir string) (*CDI, error) {
42 |
43 | if err := cdiapi.Configure(cdiapi.WithSpecDirs(cdidir)); err != nil {
44 | return nil, fmt.Errorf("unable to refresh the CDI registry: %v", err)
45 | }
46 |
47 | cdiCache := cdiapi.GetDefaultCache()
48 |
49 | cdi := &CDI{
50 | cache: cdiCache,
51 | }
52 |
53 | return cdi, nil
54 | }
55 |
56 | func (c *CDI) getQatSpecs() []*cdiapi.Spec {
57 | qatSpecs := []*cdiapi.Spec{}
58 | for _, cdiSpec := range c.cache.GetVendorSpecs(CDIVendor) {
59 | if cdiSpec.Kind == CDIKind {
60 | qatSpecs = append(qatSpecs, cdiSpec)
61 | }
62 | }
63 | return qatSpecs
64 | }
65 |
66 | func (c *CDI) SyncDevices(vfdevices device.VFDevices) error {
67 | klog.V(5).Info("Syncing CDI devices")
68 |
69 | vfspec := &cdispecs.Spec{
70 | Kind: CDIKind,
71 | }
72 | vfspecname := cdiapi.GenerateSpecName(CDIVendor, CDIClass)
73 |
74 | for _, vendorspec := range c.getQatSpecs() {
75 | vendorspecname := path.Base(vendorspec.GetPath())
76 |
77 | if vendorspec.Kind != CDIKind {
78 | klog.V(5).Infof("Spec file %s is for other kind %s, skipping...", vendorspecname, vendorspec.Kind)
79 | continue
80 | }
81 |
82 | name := vfspecname + path.Ext(vendorspecname)
83 | if name == vendorspecname {
84 | klog.V(5).Infof("Adding rest of the devices to '%s'", name)
85 | vfspec = vendorspec.Spec
86 | }
87 |
88 | vendorspecupdate := false
89 | vendorspecdevices := []cdispecs.Device{}
90 |
91 | for _, vendordevice := range vendorspec.Devices {
92 | if _, exists := vfdevices[vendordevice.Name]; exists {
93 | klog.V(5).Infof("Vendor spec %s contains device name %s", vendorspecname, vendordevice.Name)
94 |
95 | delete(vfdevices, vendordevice.Name)
96 | vendorspecdevices = append(vendorspecdevices, vendordevice)
97 | } else {
98 | klog.Warningf("CDI device '%s' in spec file '%s' does not exist", vendordevice.Name, vendorspecname)
99 | vendorspecupdate = true
100 | }
101 | }
102 | if vendorspecupdate {
103 | // Update spec file that has a nonexistent device.
104 | klog.Infof("Updating spec file %s with existing devices", path.Base(vendorspec.GetPath()))
105 |
106 | vendorspec.Devices = vendorspecdevices
107 | err := c.cache.WriteSpec(vendorspec.Spec, vendorspecname)
108 | if err != nil {
109 | klog.Warningf("Failed to update existing CDI spec file %s: %v", vendorspecname, err)
110 | }
111 | }
112 | }
113 |
114 | if len(vfdevices) > 0 {
115 | return c.appendDevices(vfspec, vfdevices, vfspecname)
116 | }
117 |
118 | return nil
119 | }
120 |
121 | func (c *CDI) adddevicespec(spec *cdispecs.Spec, vfdevices device.VFDevices) error {
122 |
123 | for _, vf := range vfdevices {
124 | cdidevice := cdispecs.Device{
125 | Name: vf.UID(),
126 | ContainerEdits: cdispecs.ContainerEdits{
127 | DeviceNodes: []*cdispecs.DeviceNode{
128 | {Path: vf.DeviceNode(), Type: "c"},
129 | },
130 | },
131 | }
132 | spec.Devices = append(spec.Devices, cdidevice)
133 |
134 | klog.V(5).Infof("Added device %s name %s", cdidevice.ContainerEdits.DeviceNodes[0].Path, cdidevice.Name)
135 | }
136 | return nil
137 | }
138 |
139 | func (c *CDI) appendDevices(spec *cdispecs.Spec, vfdevices device.VFDevices, name string) error {
140 |
141 | klog.V(5).Info("Append CDI devices")
142 |
143 | if err := c.adddevicespec(spec, vfdevices); err != nil {
144 | return err
145 | }
146 |
147 | version, err := cdiapi.MinimumRequiredVersion(spec)
148 | if err != nil {
149 | return fmt.Errorf("minimum CDI spec version not found: %v", err)
150 | }
151 | spec.Version = version
152 |
153 | err = c.cache.WriteSpec(spec, name)
154 | if err != nil {
155 | return fmt.Errorf("failed to write CDI spec %s: %v", name, err)
156 | }
157 |
158 | klog.Infof("CDI %s: Kind %s, Version %v", name, spec.Kind, spec.Version)
159 | return nil
160 | }
161 |
162 | func (c *CDI) OverwriteDevices(vfdevices device.VFDevices) error {
163 | var err error
164 |
165 | klog.V(5).Info("Add/overwrite CDI devices")
166 |
167 | spec := &cdispecs.Spec{
168 | Kind: CDIKind,
169 | }
170 |
171 | name, err := cdiapi.GenerateNameForSpec(spec)
172 | if err != nil {
173 | return fmt.Errorf("spec name not created: %v", err)
174 | }
175 |
176 | return c.appendDevices(spec, vfdevices, name)
177 | }
178 |
--------------------------------------------------------------------------------
/pkg/qat/device/state.go:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2024 Intel Corporation
2 | * SPDX-License-Identifier: Apache-2.0
3 | */
4 |
5 | package device
6 |
7 | import (
8 | "encoding/json"
9 | "fmt"
10 | "os"
11 |
12 | "k8s.io/klog/v2"
13 | )
14 |
// savedAllocations is the persisted form of the allocations: it maps an
// allocation id to the UIDs of the VF devices allocated under that id.
type savedAllocations map[string][]string
17 |
18 | func (q *QATDevices) ReadStateOrCreateEmpty(statefile string) error {
19 | if statefile == "" {
20 | return nil
21 | }
22 |
23 | if _, err := os.Stat(statefile); os.IsNotExist(err) {
24 | f, err := os.OpenFile(statefile, os.O_CREATE|os.O_WRONLY, 0600)
25 | if err != nil {
26 | return fmt.Errorf("failed to create state file '%s': %v", statefile, err)
27 | }
28 | defer f.Close()
29 |
30 | if _, err := f.WriteString("{}"); err != nil {
31 | return fmt.Errorf("failed to write to state file '%s': %v", statefile, err)
32 | }
33 |
34 | return nil
35 | }
36 |
37 | return q.readState(statefile)
38 | }
39 |
40 | func (q *QATDevices) readState(statefile string) error {
41 | if statefile == "" {
42 | return nil
43 | }
44 |
45 | savedstatebytes, err := os.ReadFile(statefile)
46 | if err != nil {
47 | return fmt.Errorf("could not read state file '%s': %v", statefile, err)
48 | }
49 |
50 | saveddevices := make(savedAllocations, 0)
51 | if err := json.Unmarshal(savedstatebytes, &saveddevices); err != nil {
52 | return fmt.Errorf("failed parsing state file '%s': %v", statefile, err)
53 | }
54 |
55 | for allocatedby, vfdevices := range saveddevices {
56 | for _, vf := range vfdevices {
57 | _, _, err := q.Allocate(vf, Unset, allocatedby)
58 |
59 | if err != nil {
60 | klog.Errorf("Failed to restore VF device '%s' for '%s': %v", vf, allocatedby, err)
61 | continue
62 | }
63 |
64 | klog.V(5).Infof("Successfully restored VF device '%s' for '%s'", vf, allocatedby)
65 | }
66 | }
67 |
68 | return nil
69 | }
70 |
71 | func (q *QATDevices) SaveState(statefile string) error {
72 | if statefile == "" {
73 | return nil
74 | }
75 |
76 | saveddevices := make(savedAllocations, 0)
77 |
78 | for _, pf := range *q {
79 | for allocatedby, vfdevices := range pf.AllocatedDevices {
80 | vflist, exists := saveddevices[allocatedby]
81 | if !exists {
82 | vflist = make([]string, 0)
83 | }
84 |
85 | for deviceuid := range vfdevices {
86 | vflist = append(vflist, deviceuid)
87 | }
88 | saveddevices[allocatedby] = vflist
89 | }
90 | }
91 |
92 | encodedstate, err := json.MarshalIndent(saveddevices, "", " ")
93 | if err != nil {
94 | return fmt.Errorf("failed save state JSON encoding to file '%s': %v", statefile, err)
95 | }
96 |
97 | return os.WriteFile(statefile, encodedstate, 0600)
98 | }
99 |
--------------------------------------------------------------------------------
/pkg/version/version.go:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2023, Intel Corporation. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package version
18 |
19 | import (
20 | "runtime"
21 |
22 | "k8s.io/klog/v2"
23 | )
24 |
// These are set during build time via -ldflags.
// Each one keeps the placeholder "N/A" when the build does not override it.
var (
	driverVersion = "N/A"
	gitCommit     = "N/A"
	buildDate     = "N/A"
)
31 |
// PrintDriverVersion logs the version information of the driver at klog Info
// level: the driver name (apiGroupName), driver version, git commit, build
// date, Go runtime version, compiler and target platform (GOOS/GOARCH).
// Nothing is returned.
func PrintDriverVersion(apiGroupName string) {
	klog.Infof(`
Driver Name: %v,
Driver Version: %v,
Git Commit: %v,
Build Date: %v,
Go Version: %v,
Compiler: %v,
Platform: %v/%v`,
		apiGroupName,
		driverVersion,
		gitCommit,
		buildDate,
		runtime.Version(),
		runtime.Compiler,
		runtime.GOOS,
		runtime.GOARCH,
	)
}
52 |
--------------------------------------------------------------------------------
/qat.mk:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
# Version of the QAT resource driver. It is the default image version and is
# also injected into pkg/version.driverVersion at link time (see QAT_LDFLAGS).
QAT_VERSION ?= v0.2.0
QAT_IMAGE_NAME ?= intel-qat-resource-driver
QAT_IMAGE_VERSION ?= $(QAT_VERSION)
# Full image reference: <registry>/<image name>:<image version>.
QAT_IMAGE_TAG ?= $(REGISTRY)/$(QAT_IMAGE_NAME):$(QAT_IMAGE_VERSION)

# Binaries produced by the `qat` target.
QAT_BINARIES = \
	bin/qat-showdevice \
	bin/kubelet-qat-plugin

# Sources that every QAT binary depends on, besides its own cmd/ directory.
QAT_COMMON_SRC = \
	$(COMMON_SRC) \
	pkg/qat/device/*.go \
	pkg/qat/cdi/*.go

# Linker flags: the shared LDFLAGS plus the driver version set through -X.
QAT_LDFLAGS = ${LDFLAGS} -extldflags $(EXT_LDFLAGS) -X ${PKG}/pkg/version.driverVersion=${QAT_VERSION}
31 |
# Build all QAT binaries listed in QAT_BINARIES.
.PHONY: qat
qat: $(QAT_BINARIES)

# Each binary is built for linux/$(ARCH) with cgo disabled, from the vendored
# modules (-mod vendor), forcing a rebuild of all packages (-a).
bin/qat-showdevice: cmd/qat-showdevice/*.go $(QAT_COMMON_SRC)
	CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH} \
	go build -a -ldflags "${QAT_LDFLAGS}" -mod vendor -o $@ ./cmd/qat-showdevice

bin/kubelet-qat-plugin: cmd/kubelet-qat-plugin/*.go $(QAT_COMMON_SRC)
	CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH} \
	go build -a -ldflags "${QAT_LDFLAGS}" -mod vendor -o $@ ./cmd/kubelet-qat-plugin
42 |
# Build the QAT resource driver image from Dockerfile.qat for linux/$(ARCH) and
# tag it as $(QAT_IMAGE_TAG). Runs the `cleanall` and `vendor` targets first.
.PHONY: qat-container-build
qat-container-build: cleanall vendor
	@echo "Building QAT resource driver container..."
	$(DOCKER) build --pull --platform="linux/$(ARCH)" -t $(QAT_IMAGE_TAG) \
	--build-arg LOCAL_LICENSES=$(LOCAL_LICENSES) -f Dockerfile.qat .

# Build the image (via qat-container-build) and push it as $(QAT_IMAGE_TAG).
.PHONY: qat-container-push
qat-container-push: qat-container-build
	$(DOCKER) push $(QAT_IMAGE_TAG)
52 |
# Run the e2e test suite. NOTE: the sed call edits
# deployments/qat/base/resource-driver.yaml in place, replacing the tag of the
# intel/intel-qat-resource-driver image with `devel`, before the tests start.
.PHONY: e2e-qat
e2e-qat:
	sed -i 's|\(intel/intel-qat-resource-driver:\)[^ ]*|\1devel|' deployments/qat/base/resource-driver.yaml
	go test -v ./test/e2e/... --clean-start=true -ginkgo.v -ginkgo.trace -ginkgo.show-node-events
57 |
--------------------------------------------------------------------------------
/test/e2e/dra_suite_test.go:
--------------------------------------------------------------------------------
1 | package e2e_test
2 |
3 | import (
4 | "context"
5 | "flag"
6 | "os"
7 | "testing"
8 |
9 | "github.com/onsi/ginkgo/v2"
10 | "github.com/onsi/gomega"
11 | v1 "k8s.io/api/core/v1"
12 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
13 | "k8s.io/component-base/logs"
14 | "k8s.io/klog/v2"
15 | "k8s.io/kubernetes/test/e2e/framework"
16 | "k8s.io/kubernetes/test/e2e/framework/config"
17 | e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
18 |
19 | _ "github.com/intel/intel-resource-drivers-for-kubernetes/test/e2e/qat"
20 | )
21 |
22 | func init() {
23 | ginkgo.SynchronizedBeforeSuite(setupFirstNode, func(data []byte) {})
24 | }
25 |
26 | func setupFirstNode(ctx context.Context) []byte {
27 | c, err := framework.LoadClientset()
28 | if err != nil {
29 | framework.Failf("Error loading client: %v", err)
30 | }
31 |
32 | // Delete any namespaces except those created by the system. This ensures no
33 | // lingering resources are left over from a previous test run.
34 | if framework.TestContext.CleanStart {
35 | deleted, err2 := framework.DeleteNamespaces(ctx, c, nil, /* deleteFilter */
36 | []string{
37 | metav1.NamespaceSystem,
38 | metav1.NamespaceDefault,
39 | metav1.NamespacePublic,
40 | v1.NamespaceNodeLease,
41 | "cert-manager",
42 | })
43 | if err2 != nil {
44 | framework.Failf("Error deleting orphaned namespaces: %v", err2)
45 | }
46 |
47 | framework.Logf("Waiting for deletion of the following namespaces: %v", deleted)
48 |
49 | if err2 = framework.WaitForNamespacesDeleted(ctx, c, deleted, e2epod.DefaultPodDeletionTimeout); err2 != nil {
50 | framework.Failf("Failed to delete orphaned namespaces %v: %v", deleted, err2)
51 | }
52 | }
53 |
54 | return []byte{}
55 | }
// TestDra is the `go test` entry point of the suite: it routes Gomega assertion
// failures to ginkgo.Fail and then runs the registered Ginkgo specs under the
// suite name "E2E DRA Drivers Suite".
func TestDra(t *testing.T) {
	// The fail handler must be registered before the specs run.
	gomega.RegisterFailHandler(ginkgo.Fail)
	ginkgo.RunSpecs(t, "E2E DRA Drivers Suite")
}
60 |
// TestMain prepares logging and command-line flags for the e2e framework and
// then runs the tests, exiting the process with the code returned by m.Run.
func TestMain(m *testing.M) {
	// Send klog output to the Ginkgo writer.
	klog.SetOutput(ginkgo.GinkgoWriter)

	logs.InitLogs()
	// Copy the flags held in config.Flags and register the framework's common
	// and cluster flags on the process-wide flag set, then parse the command line.
	config.CopyFlags(config.Flags, flag.CommandLine)
	framework.RegisterCommonFlags(flag.CommandLine)
	framework.RegisterClusterFlags(flag.CommandLine)
	flag.Parse()

	// All flags are parsed at this point; hand the test context to the
	// framework's post-flag hook.
	framework.AfterReadingAllFlags(&framework.TestContext)

	// Now run the test suite.
	os.Exit(m.Run())
}
76 |
--------------------------------------------------------------------------------
/test/e2e/utils/utils.go:
--------------------------------------------------------------------------------
1 | package utils
2 |
3 | import (
4 | "context"
5 | "errors"
6 | "fmt"
7 | "os"
8 | "path/filepath"
9 |
10 | "k8s.io/kubernetes/test/e2e/framework"
11 | e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
12 | )
13 |
// LocateRepoFile locates a file inside this repository.
//
// The following candidates are tried in order and the first one that can be
// stat'ed successfully is returned:
//
//  1. $PLUGINS_REPO_DIR/repopath, when PLUGINS_REPO_DIR is set;
//  2. <working directory>/repopath;
//  3. <working directory>/../../repopath.
//
// An error is returned when the working directory cannot be determined or when
// none of the candidates exists. A non-empty path is only ever returned
// together with a nil error.
func LocateRepoFile(repopath string) (string, error) {
	// A candidate counts as found only when os.Stat succeeds. Checking for
	// "not a not-exist error" would also accept paths that fail with e.g.
	// ENOTDIR or EACCES and hand an unusable path back to the caller.
	exists := func(path string) bool {
		_, err := os.Stat(path)
		return err == nil
	}

	if root := os.Getenv("PLUGINS_REPO_DIR"); root != "" {
		if path := filepath.Join(root, repopath); exists(path) {
			return path, nil
		}
	}

	currentDir, err := os.Getwd()
	if err != nil {
		return "", err
	}

	candidates := []string{
		filepath.Join(currentDir, repopath),
		filepath.Join(currentDir, "../../"+repopath),
	}
	for _, path := range candidates {
		if exists(path) {
			return path, nil
		}
	}

	return "", errors.New("no file found, try to define PLUGINS_REPO_DIR pointing to the root of the repository")
}
41 |
42 | // GetPodLogs returns the log of the container. If not possible to get logs, it returns the error message.
43 | func GetPodLogs(ctx context.Context, f *framework.Framework, podName, containerName string) string {
44 | log, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, podName, containerName)
45 | if err != nil {
46 | return fmt.Sprintf("unable to get log from pod: %v", err)
47 | }
48 |
49 | return fmt.Sprintf("log output of the container %s in the pod %s:%s", containerName, podName, log)
50 | }
51 |
--------------------------------------------------------------------------------